From 5d63af1b4ace43c22433ae1f135b92e113a32082 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Mon, 23 Nov 2020 16:40:22 -0800 Subject: [PATCH] gpu: add nvidia support --- LICENSE | 1 + README.md | 13 + THIRD-PARTY-PROGRAMS | 1 + cmake/FindPI_CUDA.cmake | 31 + cmake/FindcuBLAS.cmake | 45 + cmake/FindcuDNN.cmake | 55 ++ cmake/options.cmake | 11 + examples/CMakeLists.txt | 8 + src/common/dnnl_thread.hpp | 2 +- src/common/memory_tracking.hpp | 5 + src/cpu/cpu_stream.hpp | 1 - src/gpu/CMakeLists.txt | 5 + src/gpu/nvidia/CMakeLists.txt | 51 + src/gpu/nvidia/README.md | 330 +++++++ src/gpu/nvidia/cudnn_batch_normalization.cpp | 38 + src/gpu/nvidia/cudnn_batch_normalization.hpp | 198 ++++ .../cudnn_batch_normalization_executor.hpp | 549 +++++++++++ .../nvidia/cudnn_batch_normalization_impl.hpp | 347 +++++++ src/gpu/nvidia/cudnn_binary.cpp | 58 ++ src/gpu/nvidia/cudnn_binary.hpp | 125 +++ src/gpu/nvidia/cudnn_binary_impl.hpp | 143 +++ src/gpu/nvidia/cudnn_concat.cpp | 42 + .../cudnn_conv_filter_adjustment_base.hpp | 169 ++++ src/gpu/nvidia/cudnn_conv_inner_product.hpp | 396 ++++++++ .../nvidia/cudnn_conv_inner_product_impl.hpp | 701 ++++++++++++++ src/gpu/nvidia/cudnn_convolution.cpp | 256 +++++ src/gpu/nvidia/cudnn_convolution.hpp | 333 +++++++ src/gpu/nvidia/cudnn_convolution_impl.hpp | 900 ++++++++++++++++++ src/gpu/nvidia/cudnn_convolution_pd.hpp | 77 ++ src/gpu/nvidia/cudnn_deconvolution.cpp | 57 ++ src/gpu/nvidia/cudnn_deconvolution.hpp | 476 +++++++++ src/gpu/nvidia/cudnn_deconvolution_impl.hpp | 92 ++ src/gpu/nvidia/cudnn_eltwise.cpp | 85 ++ src/gpu/nvidia/cudnn_eltwise.hpp | 116 +++ src/gpu/nvidia/cudnn_eltwise_impl.hpp | 203 ++++ src/gpu/nvidia/cudnn_gemm_inner_product.hpp | 347 +++++++ .../nvidia/cudnn_gemm_inner_product_impl.hpp | 463 +++++++++ src/gpu/nvidia/cudnn_inner_product.cpp | 238 +++++ src/gpu/nvidia/cudnn_inner_product.hpp | 90 ++ src/gpu/nvidia/cudnn_inner_product_impl.hpp | 191 ++++ src/gpu/nvidia/cudnn_lrn.cpp | 89 ++ src/gpu/nvidia/cudnn_lrn.hpp | 132 +++ src/gpu/nvidia/cudnn_lrn_impl.hpp | 201 ++++ src/gpu/nvidia/cudnn_matmul.cpp | 87 ++ src/gpu/nvidia/cudnn_matmul.hpp | 151 +++ src/gpu/nvidia/cudnn_matmul_executor.hpp | 300 ++++++ src/gpu/nvidia/cudnn_matmul_impl.hpp | 403 ++++++++ src/gpu/nvidia/cudnn_pooling.cpp | 157 +++ src/gpu/nvidia/cudnn_pooling.hpp | 200 ++++ src/gpu/nvidia/cudnn_pooling_impl.hpp | 234 +++++ src/gpu/nvidia/cudnn_reorder.cpp | 55 ++ src/gpu/nvidia/cudnn_reorder.hpp | 122 +++ src/gpu/nvidia/cudnn_reorder_impl.cpp | 46 + src/gpu/nvidia/cudnn_reorder_impl.hpp | 182 ++++ src/gpu/nvidia/cudnn_resampling.cpp | 94 ++ src/gpu/nvidia/cudnn_resampling.hpp | 269 ++++++ src/gpu/nvidia/cudnn_resampling_impl.hpp | 171 ++++ src/gpu/nvidia/cudnn_softmax.cpp | 85 ++ src/gpu/nvidia/cudnn_softmax.hpp | 116 +++ src/gpu/nvidia/cudnn_softmax_impl.hpp | 255 +++++ src/gpu/nvidia/cudnn_sum.cpp | 41 + src/gpu/nvidia/cudnn_sum.hpp | 70 ++ src/gpu/nvidia/sycl_cuda_engine.cpp | 199 ++++ src/gpu/nvidia/sycl_cuda_engine.hpp | 121 +++ src/gpu/nvidia/sycl_cuda_scoped_context.cpp | 63 ++ src/gpu/nvidia/sycl_cuda_scoped_context.hpp | 60 ++ src/gpu/nvidia/sycl_cuda_stream.cpp | 126 +++ src/gpu/nvidia/sycl_cuda_stream.hpp | 81 ++ src/gpu/nvidia/sycl_cuda_utils.hpp | 522 ++++++++++ src/gpu/ocl/ref_sum.hpp | 2 + tests/benchdnn/binary/binary.cpp | 8 + tests/benchdnn/bnorm/bnorm.cpp | 23 + tests/benchdnn/conv/conv.cpp | 41 + tests/benchdnn/conv/deconv.cpp | 43 + tests/benchdnn/dnnl_common.cpp | 45 + tests/benchdnn/dnnl_common.hpp | 8 + tests/benchdnn/dnnl_memory.hpp | 15 + 
tests/benchdnn/eltwise/eltwise.cpp | 10 +- .../inputs/resampling/test_resampling_all | 1 - tests/benchdnn/ip/ip.cpp | 23 + tests/benchdnn/lnorm/lnorm.cpp | 6 + tests/benchdnn/lrn/lrn.cpp | 8 + tests/benchdnn/matmul/matmul.cpp | 25 + tests/benchdnn/pool/pool.cpp | 23 + tests/benchdnn/reduction/reduction.cpp | 5 + tests/benchdnn/reorder/reorder.cpp | 8 + tests/benchdnn/resampling/resampling.cpp | 19 +- tests/benchdnn/rnn/rnn.cpp | 5 + tests/benchdnn/shuffle/shuffle.cpp | 6 + tests/gtests/CMakeLists.txt | 2 +- tests/gtests/api/CMakeLists.txt | 7 +- tests/gtests/api/test_memory_creation.cpp | 6 + tests/gtests/api/test_namespace.cpp | 29 + tests/gtests/dnnl_test_common.hpp | 32 +- tests/gtests/dnnl_test_macros.hpp | 21 + .../test_batch_normalization_common.hpp | 34 + tests/gtests/test_binary.cpp | 28 +- tests/gtests/test_concat.cpp | 16 + .../test_convolution_backward_data_common.hpp | 45 + ...st_convolution_backward_weights_common.hpp | 45 + ...est_convolution_eltwise_forward_common.hpp | 50 + tests/gtests/test_convolution_format_any.cpp | 3 + .../test_convolution_forward_common.hpp | 39 + tests/gtests/test_deconvolution.cpp | 39 +- tests/gtests/test_eltwise.cpp | 26 + tests/gtests/test_gemm_common.hpp | 1 + tests/gtests/test_iface_pd_iter.cpp | 1 + tests/gtests/test_iface_runtime_attr.cpp | 8 + tests/gtests/test_iface_wino_convolution.cpp | 4 +- .../test_inner_product_backward_data.cpp | 33 +- .../test_inner_product_backward_weights.cpp | 39 + tests/gtests/test_inner_product_forward.cpp | 37 + tests/gtests/test_layer_normalization.cpp | 1 + tests/gtests/test_logsoftmax.cpp | 13 + tests/gtests/test_lrn_backward.cpp | 26 +- tests/gtests/test_lrn_forward.cpp | 13 + tests/gtests/test_matmul.cpp | 16 +- tests/gtests/test_pooling_backward.cpp | 19 + tests/gtests/test_pooling_forward.cpp | 42 +- tests/gtests/test_reorder_common.hpp | 27 + tests/gtests/test_resampling.cpp | 11 + tests/gtests/test_shuffle.cpp | 1 + tests/gtests/test_softmax.cpp | 12 + tests/gtests/test_sum.cpp | 19 +- 124 files changed, 12919 insertions(+), 31 deletions(-) create mode 100644 cmake/FindPI_CUDA.cmake create mode 100644 cmake/FindcuBLAS.cmake create mode 100644 cmake/FindcuDNN.cmake create mode 100644 src/gpu/nvidia/CMakeLists.txt create mode 100644 src/gpu/nvidia/README.md create mode 100644 src/gpu/nvidia/cudnn_batch_normalization.cpp create mode 100644 src/gpu/nvidia/cudnn_batch_normalization.hpp create mode 100644 src/gpu/nvidia/cudnn_batch_normalization_executor.hpp create mode 100644 src/gpu/nvidia/cudnn_batch_normalization_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_binary.cpp create mode 100644 src/gpu/nvidia/cudnn_binary.hpp create mode 100644 src/gpu/nvidia/cudnn_binary_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_concat.cpp create mode 100644 src/gpu/nvidia/cudnn_conv_filter_adjustment_base.hpp create mode 100644 src/gpu/nvidia/cudnn_conv_inner_product.hpp create mode 100644 src/gpu/nvidia/cudnn_conv_inner_product_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_convolution.cpp create mode 100644 src/gpu/nvidia/cudnn_convolution.hpp create mode 100644 src/gpu/nvidia/cudnn_convolution_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_convolution_pd.hpp create mode 100644 src/gpu/nvidia/cudnn_deconvolution.cpp create mode 100644 src/gpu/nvidia/cudnn_deconvolution.hpp create mode 100644 src/gpu/nvidia/cudnn_deconvolution_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_eltwise.cpp create mode 100644 src/gpu/nvidia/cudnn_eltwise.hpp create mode 100644 src/gpu/nvidia/cudnn_eltwise_impl.hpp create mode 
100644 src/gpu/nvidia/cudnn_gemm_inner_product.hpp create mode 100644 src/gpu/nvidia/cudnn_gemm_inner_product_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_inner_product.cpp create mode 100644 src/gpu/nvidia/cudnn_inner_product.hpp create mode 100644 src/gpu/nvidia/cudnn_inner_product_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_lrn.cpp create mode 100644 src/gpu/nvidia/cudnn_lrn.hpp create mode 100644 src/gpu/nvidia/cudnn_lrn_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_matmul.cpp create mode 100644 src/gpu/nvidia/cudnn_matmul.hpp create mode 100644 src/gpu/nvidia/cudnn_matmul_executor.hpp create mode 100644 src/gpu/nvidia/cudnn_matmul_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_pooling.cpp create mode 100644 src/gpu/nvidia/cudnn_pooling.hpp create mode 100644 src/gpu/nvidia/cudnn_pooling_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_reorder.cpp create mode 100644 src/gpu/nvidia/cudnn_reorder.hpp create mode 100644 src/gpu/nvidia/cudnn_reorder_impl.cpp create mode 100644 src/gpu/nvidia/cudnn_reorder_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_resampling.cpp create mode 100644 src/gpu/nvidia/cudnn_resampling.hpp create mode 100644 src/gpu/nvidia/cudnn_resampling_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_softmax.cpp create mode 100644 src/gpu/nvidia/cudnn_softmax.hpp create mode 100644 src/gpu/nvidia/cudnn_softmax_impl.hpp create mode 100644 src/gpu/nvidia/cudnn_sum.cpp create mode 100644 src/gpu/nvidia/cudnn_sum.hpp create mode 100644 src/gpu/nvidia/sycl_cuda_engine.cpp create mode 100644 src/gpu/nvidia/sycl_cuda_engine.hpp create mode 100644 src/gpu/nvidia/sycl_cuda_scoped_context.cpp create mode 100644 src/gpu/nvidia/sycl_cuda_scoped_context.hpp create mode 100644 src/gpu/nvidia/sycl_cuda_stream.cpp create mode 100644 src/gpu/nvidia/sycl_cuda_stream.hpp create mode 100644 src/gpu/nvidia/sycl_cuda_utils.hpp create mode 100644 tests/gtests/api/test_namespace.cpp diff --git a/LICENSE b/LICENSE index 6e669453225..e2b21ebb4e7 100644 --- a/LICENSE +++ b/LICENSE @@ -180,6 +180,7 @@ Copyright 2016-2020 Intel Corporation Copyright 2018 YANDEX LLC Copyright 2020 Arm Limited and affiliates + Copyright 2020 Codeplay Software Limited Copyright 2019-2020 FUJITSU LIMITED Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/README.md b/README.md index e2ec38e0ec3..71b4032f059 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ The library is optimized for Intel Architecture Processors, Intel Processor Graphics and Xe architecture-based Graphics. oneDNN has experimental support for the following architectures: * Arm\* 64-bit Architecture (AArch64) +* NVIDIA\* GPU * OpenPOWER\* Power ISA (PPC64) * IBMz\* (s390x) @@ -190,6 +191,18 @@ is enabled: * [Intel oneAPI DPC++ Compiler](https://software.intel.com/en-us/oneapi/dpc-compiler) Beta * OpenCL runtime library (OpenCL version 1.2 or later) * [oneAPI Level Zero](https://github.com/oneapi-src/level-zero) +* DPCPP runtime with NVIDIA GPU support requires + * [oneAPI DPC++ Compiler](https://github.com/intel/llvm) + * OpenCL runtime library (OpenCL version 1.2 or later) + * NVIDIA CUDA\* driver + * cuBLAS 10.1 or later + * cuDNN 7.6 or later + +> **WARNING** +> +> NVIDIA GPU support is experimental. General information, build instructions +> and implementation limitations is available in +> [NVIDIA backend readme](https://github.com/oneapi-src/oneDNN/blob/master/src/gpu/NVIDIA/README.md). 
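Once the library is built with `DNNL_GPU_VENDOR=NVIDIA` (see the backend readme linked above), NVIDIA devices are reached through the regular `dnnl::engine::kind::gpu` engine kind. The snippet below is a minimal illustrative sketch, not part of this patch; it assumes the NVIDIA GPU is the first GPU device visible to the runtime.

```cpp
#include "dnnl.hpp"

int main() {
    // With DNNL_GPU_VENDOR=NVIDIA, engine kind gpu dispatches to the
    // cuDNN/cuBLAS-based backend.
    dnnl::engine eng(dnnl::engine::kind::gpu, /*index=*/0);

    // The NVIDIA backend creates an out-of-order SYCL queue by default;
    // an in-order queue can be requested explicitly via stream flags.
    dnnl::stream strm(eng, dnnl::stream::flags::in_order);

    return 0;
}
```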
### Runtime Dependencies diff --git a/THIRD-PARTY-PROGRAMS b/THIRD-PARTY-PROGRAMS index b5dbd34a780..f9c90d72fcb 100644 --- a/THIRD-PARTY-PROGRAMS +++ b/THIRD-PARTY-PROGRAMS @@ -178,6 +178,7 @@ Copyright (c) 2015-2017 Martin Hensel Copyright (c) 2007, Apostolos Syropoulos (;$" + COMPILE_OPTIONS + "$;$;$" +) +target_include_directories( + ${OBJ_LIB} + PRIVATE $ + $ + $) + +add_library(${OBJ_LIB}_interface INTERFACE) +target_link_libraries(${OBJ_LIB}_interface INTERFACE cuBLAS::cuBLAS + cuDNN::cuDNN + OpenCL::OpenCL) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) + +set(${LIB_NAME}_INTERFACE + ${${LIB_NAME}_INTERFACE} ${OBJ_LIB}_interface + PARENT_SCOPE) diff --git a/src/gpu/nvidia/README.md b/src/gpu/nvidia/README.md new file mode 100644 index 00000000000..41b0de00cb2 --- /dev/null +++ b/src/gpu/nvidia/README.md @@ -0,0 +1,330 @@ +# Nvidia backend support + +## General information + +The Nvidia backend for oneDNN can be exposed to the user via the +`dnnl::engine::kind::gpu` engine kind. Currently, for the case when user's +system has both Intel and Nvidia GPUs, `DNNL_GPU_VENDOR=NVIDIA` flag is used in +CMake, since the devices are clustered based on the device vendor ID and index +pattern can not be used to distinguish between Intel GPU and Nvidia GPU. +However, Intel is working on restructuring the engine creation, so that it would +be possible to choose engine kind and vendor kind at runtime. Also, it is +possible to create oneDNN engines using `sycl::device` objects corresponding to +Nvidia GPUs. The stream in Nvidia backend for oneDNN defines an out-of-order +SYCL queue by default. Similar to the existing oneDNN API, user can specify an +in-order queue when creating a stream if needed. + +## Build command + +```bash +export CC=/path/to/dpcpp/install/bin/clang +export CXX=/path/to/dpcpp/install/bin/clang++ +mkdir build +cd build +cmake -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP \ + -DDNNL_GPU_VENDOR=NVIDIA -G Ninja \ + -DOPENCLROOT=/path/to/the/root/folder/of/libOpenCL.so .. +``` + +## Memory + +Currently, only the buffer-based oneDNN API is supported for Nvidia backend. + +## Suported Data Types + +The following table documents the supported data types. + +| Data Type | Computation Mode | +|-----------|-----------------------------| +| f32 | Training, Inference | +| f16 | Inference | +| s8 | Inference (when applicable) | + +## Supported Primitives and Implementation Limitations + +cuDNN functions are not necessarily the same as oneDNN primitives due to lack of +standard API for DNN. For each primitive the cuDNN equivalent function is added +to the Nvidia backend for oneDNN. However, the added backend cannot provide all +functionalities supported by oneDNN primitives. The detailed limitations of each +cuDNN primitive are explained as follow. + +### Batch normalization + +The closest equivalent to oneDNN batch normalization can be +`cudnnBatchNormalizationForward` and `cudnnBatchNormalizationBackward` +operations. However, there are some difference between cuDNN and oneDNN batch +normalization. + +#### Forward direction + +* When `global_stats` flag is set for batch normalization, the mean and variance + are input only parameters. However, cuDNN does not have the option to accept + the mean and variance as inputs in the forward training operation. Therefore, + `cudnnBatchNormalizationForwardInference` is used to match the oneDNN feature. + Although inference is not supported without `global_stats` flags set. 
+* The cuDNN precision is different from that of oneDNN for Batch Normalization. + (e.g `fp:0.0170898 dt:0.0170907 diff:8.27014e-07 rdiff:4.83922e-05`) +* The forward training with no flags accepts mean and variance as an output. + However, in cuDNN the mean and variance are running mean and variance + respectably so they are both input and output variable. Therefore, they are + required to have a sensible value (cannot be NaN). Since oneDNN will not set + value for the mean and variance when no flag is passed, the NaN can be + propagated as a result. To avoid NaN propagation, `cudaMemset` function is + used to initialize the mean and variance with zero. +* cuDNN always requires the values for scale and shift. When shift and scale are + not defined in oneDNN, `cudaMemset` is used to initialize scale to 1 and shift + to 0. +* For performance reason in the backward pass, cuDNN requires the mean and + inverse variance to be saved in the forward pass. Therefore, when Nvidia + backend is used for batch normalization, the workspace must be provided to + save the mean and inverse variance. +* When `dnnl_fuse_norm_relu` flag is set for batch normalization, the + `cudnnActivationForward` operation is called immediately after the batch + normalization, since cuDNN does not have a fused batch normalization with + `RELU`. The implementation for element-wise post operations is the same. +* When `dnnl_fuse_norm_relu` is used, the intermediate output of batch + normalization, which is used as an input to the activation function, is saved + in the workspace as well. This is required to compute the backward pass for + `dnnl_fuse_norm_relu` flag. +* Forward pass supports f32, f16 and s8 data types. Although blocking is not + supported for s8. + +#### Backward direction + +* cuDNN uses `alpha` and `beta` parameters to blend the `dy`, `shift` and + `scale`. Since oneDNN does not have this feature, the `alpha` and `beta` + values in the backward direction are set to 1 and 0 respectively to avoid + blending. +* Nvidia backend for backward direction requires the workspace as an input + containing the mean and inverse variance computed in the forward pass. +* The Nvidia backend for oneDNN does not support the backward direction for + batch normalization when the flag is set to `global_stats`. This is due to the + fact that oneDNN will skip the +

+  backward computation for the mean and variance
+ since the mean and variance are constant, however, cuDNN does not have an + option to skip this operation. +* When `dnnl_fuse_norm_relu` flag is set, Nvidia backend requires the + intermediate result of the batch normalization saved in the forward pass. This + is used to compute the backward direction of the activation function used for + `RELU`. + +### Binary + +The `cudnnOpTensor` is equivalent of oneDNN binary primitives. + +* Only scales attribute is supported. Post-op attribute is not supported. +* Blocking is only supported for `int8` and only in the C dimension with either + 4 or 32 block size (same as other cuDNN primitives). + +### Concat + +The concat operation uses the reorder primitive to concatenate tensors over the +chosen dimension, so the same limitation as reorder applies here. + +### Convolution + +The `cudnnConvolutionForward`, `cudnnConvolutionBackward` and +`cudnnConvolutionBackwardFilter` is used to compute forward, backward by data or +backward by weights for a convolution operation. + +* Blocking is only supported for `int8` and only in the C dimension with block + size of 4. Input and output tensors must have the same data type. +* For int8 (s8s8s8) with post-ops the operations are performed as s8s8f32 (due + to cuDNN limitations) then reordered to `s8` at the end which impacts + performance. +* Direct convolution is not supported, so implicit GEMM is used in those cases. +* "Left" padding must be greater or equal to "right" padding, and the requested + spatial output should match the output formula for two "left" padding used. +* Eltwise post-op limitations are the same as our eltwise limitation as post-ops + are not fused. +* cuDNN requires padding tensors to 4 dimensions, so 1D convolutions are + supported but are performed as 2D. + +The following table shows the convolution status for the oneDNN Nvidia backend: + +#### Forward direction +| Weights Format | Winograd Supported | Supported Input Format | Supported Output Format | Supported Data Type | Limitations | +|----------------|--------------------|------------------------|-------------------------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 2D NCHW | YES | NCHW, NHWC | NCHW, NHWC | f32, f16 | The Winograd algorithm has limitations:
* Filter size must be 3x3 or 5x5.
* Dilation must be zero for all dimensions.
* Horizontal and vertical filter stride must be 1. | +| 2D NHWC | NO | NHWC | NHWC | f32, f16, int8 | * Dilation must be zero in all dimensions.
* Output feature maps must be multiple of 4 for `int8` type. | +| 3D NCHW | NO | NCHW, NHWC | NCHW, NHWC | f32, f16 | | + +#### Backward direction +| Weights Format | Winograd Supported | Supported Input Format | Supported Output Format | Supported Data Type | Limitations | +|----------------|--------------------|------------------------|-------------------------|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 2D NCHW | YES | NCHW, NHWC | NCHW | f32, f16 | 1. Dilation must be zero for all dimensions.
2. The Winograd algorithm has limitations:
* Filter size must be 3x3 or 5x5.
* Dilation must be zero for all dimensions.
* Horizontal and vertical filter stride must be 1. | +| 2D NHWC | NO | NHWC | NHWC | f32, f16 | | +| 3D NCHW | NO | NCHW, NHWC | NCHW | f32, f16 | | +### Deconvolution + +Deconvolution primitive is implemented through the convolution with swapped +input abd output channels. + +* Currently, there is a bug, likely in this code, which causes crashes in + memory_tracking for 3D backward_weights with bias when backward_weights + without bias was also a part of the run. Cache interrogation is suspected due + to cache-free runs are successful. Switched off in benchdnn until further + investigation and the fix. + +### Eltwise + +The `cudnnActivationForward` and `cudnnActivationBackward` is the equivalent of +eltwise forward and eltwise backward in oneDNN respectively. There are some +limitations when using Nvidia backend for eltwise primitive: + +* cuDNN only supports the following operations - `RELU`, `ELU`, `TANH`, + `LOGISTIC` and `BRELU`. +* `RELU` is only supported with alpha = 0. +* cuDNN expects `x`, `y` and `dy` as inputs to the backward pass, hence, only + `RELU` and `BRELU` operations are supported in the backward pass. + TODO: add `ELU_DST`, `TANH_DST` and `LOGISTIC_DST` support which require `dy`. +* Forward pass supports `f32`, `f16` and `s8` data types. Although blocking is + not supported for `s8`. +* Backward pass supports `f32` and `f16` data types. + +### Inner product + +The inner product primitives is an implementation of matrix multiplication plus +bias activation. There are two implementation of inner product in cuDNN backend. + +#### Using GEMM + +The default backend for inner product is the gemm backend using `cublasGemmEx` +for forward, backward data, and backward weight and `cudnnReduceTensor` for +backward bias. A function called `gemm_consitency_check()`, `dense_check()` is +used to see if the gemm backend can be used for inner product. `reorder_check()` +is used when reorder is required. If none of the above condition are met, it +falls back to the convolution backend. `cudnnActivationForward` operation is +used for eltwise operation and `cudnnAddTensor` is used for bias operation. The +`beta` parameter in gemm is used for the sum scale and `alpha` parameter is used +for the output scale. + +#### Using convolution + +For the forward direction, this operation can be implemented by using +`cudnnConvolutionBiasActivation` by converting the inner product to `1x1` +convolution. For the backward direction the inner product operation will be +equivalent of `cudnnConvolutionBackwardData`, `cudnnConvolutionBackwardWeights` +and `cudnnConvolutionBackwardBias` when applied. This implementation of inner +product has the following restrictions and performance implications: + +* The only blocked layouts are those that are supported in cuDNN - namely that + the blocking is done on the C dimension, the block size is 4, and only for + `int8` inference. The additional requirement is that both the input and filter + must be blocked. +* The `ReLU` and sum are supported as a fused post-op, for other post-op a + separate call to eltwise primitive is performed. So the limitation for the + eltwise primitive is applied here. +* Only `mask = 0` case is supported for output scale. +* The restrictions for the convolution primitive are applied here for input and + filter format. When required, the filter is internally reordered to match the + convolution restriction. +* For `int8` cuDNN requires both input and output feature maps to be a multiple + of 4. 
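As a rough illustration of the GEMM path described above (not the backend's actual code), the sketch below lowers a forward f32 inner product `dst = src * weights^T` to `cublasGemmEx`, with `alpha` carrying the output scale and `beta` the sum scale. The dimension names (`MB`, `IC`, `OC`) and the helper signature are assumptions made for the example.

```cpp
#include <cublas_v2.h>

// Sketch only: forward f32 inner product as a single GEMM.
//   src:     MB x IC, row-major
//   weights: OC x IC, row-major
//   dst:     MB x OC, row-major
// cuBLAS is column-major, so computing dst^T = weights * src^T yields the
// row-major dst = src * weights^T.
void gemm_inner_product_fwd(cublasHandle_t handle, int MB, int IC, int OC,
        const float *src, const float *weights, float *dst,
        float output_scale, float sum_scale) {
    const float alpha = output_scale; // output scale
    const float beta = sum_scale; // sum post-op scale
    cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
            /*m=*/OC, /*n=*/MB, /*k=*/IC, &alpha,
            weights, CUDA_R_32F, /*lda=*/IC,
            src, CUDA_R_32F, /*ldb=*/IC, &beta,
            dst, CUDA_R_32F, /*ldc=*/OC,
            /*computeType=*/CUDA_R_32F, // CUBLAS_COMPUTE_32F on CUDA 11+
            CUBLAS_GEMM_DEFAULT);
}
```

Bias and eltwise post-ops would then be applied with `cudnnAddTensor` and `cudnnActivationForward`, as noted above.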
+ +### LRN + +The local response normalization primitive in the Nvidia backend is implemented +with the `cudnnLRNForward` and `cudnnLRNBackward` functions for forward and +backward propagation respectively. + +* `WITHIN` algorithm is not supported. +* There is a difference in the LRN algorithm used in oneDNN and cuDNN which + causes a mismatch when the local size is even. +* cuDNN supports NCHW tensor formats for all valid dimensions. However, it does + not support the NHWC tensor format for above 5 dimensions. + +### Matrix Multiplication + +The matrix multiplication primitive in the Nvidia backend is implemented with +`cublasGemmEx` and `cublasGemmStridedBatchedEx` functions. + +* Zero points support is not provided by cuBLAS and, hence, not supported by the + Nvidia backend. +* Post-ops and output scale limitations are same as for Inner Product. + +### Pooling + +The pooling primitive in the Nvidia backend is implemented with the +`cudnnPoolingForward` and `cudnnPoolingBackward` functions for forward and +backward propagation respectively. + +* cuDNN only allows the use of symmetric padding, i.e. padding at the beginning + of a dimension must be the same as the padding at the end of that dimension. + oneDNN doesn't have this limitation. Therefore, + + - Configurations where padding in the beginning is larger than padding at + the end are supported and work as expected. + - For configurations where padding at the end is larger than padding in the + beginning of any dimension, the primitive returns `status::unimplemented`. + +* For backward propagation cuDNN requires the parameters `x`, `y`, `dx` and + `dy`, while oneDNN requires only `dx`, `dy` and workspace when the `MAX` + algorithm is used. Hence, the workspace is used to store the `x` and `y` + parameters in the forward pass for the Nvidia backend. Therefore, the + workspace is always required when the Nvidia backend is used (except for the + forward inference). + +### Reorder + +The `cudnnTransform` function is the equivalent of oneDNN reorder function. +However, there are some limitations when using SYCL_API-DNN reorder on Nvidia +GPU: + +* Per dimension scaling is not supported (a single alpha and beta value is + accepted by the transform tensor function). +* Blocking is only permitted for the channel dimension in cuDNN. This primitive + currently supports block size of 4. +* Blocking is only supported when channel dimension is a multiple of the block + size and the datatype is `int8`. + +### Resampling + +The `cudnnSpatialTfSamplerForward` and `cudnnSpatialTfSamplerBackward` are used +to implement the resampling primitive. + +The Nvidia's spatial sampling is based on +[Spacial Transformer Network](https://papers.nips.cc/paper/5854-spatial-transformer-networks.pdf) +where all the data locations are normalized between `-1 <= (xi, yi) <= 1`. + +* cuDNN backend requires a grid of coordinates that can be sample-up/down based + on `theta`. The grid is generated by `cudnnSpatialTfGridGeneratorForward`. +* The `theta` is a `MB * 2 * 3` matrix scaling factor for each coordinate and is + used to generate the grid. +* The grid value must be normalized in range [-1 , 1]. cuDNN clamps the out of + bounds coordinate to zero. Therefore, it is needed to manually clamp the out + of bound coordinate to edges in order to avoid incorrect result. +* 3D spatial sampling is not supported in cuDNN. +* `Nearest neighbour` algorithm is not supported in cuDNN. 
+* Since cuDNN computation is different from that of oneDNN, the error threshold + is smaller than other oneDNN implementation, so reduced testing accuracy for + `fp32` and `fp16` data types are required. +* The backward pass requires an output parameter for `d_grid` which cannot be + `nullptr`. However, since the grid coordinates are not a tunable parameter in + oneDNN, a dummy memory for `d_grid` is created and is deleted when the + destructor of the primitive is called. + +### Softmax/LogSoftmax + +The `cudnnSoftmaxForward` and `cudnnSoftmaxBackward` are used to implement the +softmax primitive. For logsoftmax primitive the same functions will be used and +the algorithm selection in cuDNN for the above mentioned functions will be +changed to `CUDNN_SOFTMAX_LOG`. + +* The softmax axis is supported for only the channel dimension, (i.e., axis=1). +* There is a bug in cuDNN softmax for 5D tensor with format `NHWC`. When the + channel size is greater than 1, it only applies softmax for a single channel + and leave the others untouched. + +### Sum + +The sum operation uses the reorder primitive to sum tensors, so the same +limitation as reorder applies here. + +### Other primitives + +Rest primitives not listed above are not supported by Nvidia backend. This is +likely due to either missed functionality in cuDNN or cuBLAS, or lack of +priority in supporting of such functionality. diff --git a/src/gpu/nvidia/cudnn_batch_normalization.cpp b/src/gpu/nvidia/cudnn_batch_normalization.cpp new file mode 100644 index 00000000000..1b1c8231c66 --- /dev/null +++ b/src/gpu/nvidia/cudnn_batch_normalization.cpp @@ -0,0 +1,38 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/nvidia/cudnn_batch_normalization.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_batch_normalization_fwd_t::execute(const exec_ctx_t &ctx) const { + return cudnn_batch_normalization_common_t::execute( + ctx, ctx.stream()->engine(), pd()); +} + +status_t cudnn_batch_normalization_bwd_t::execute(const exec_ctx_t &ctx) const { + return cudnn_batch_normalization_common_t::execute( + ctx, ctx.stream()->engine(), pd()); +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_batch_normalization.hpp b/src/gpu/nvidia/cudnn_batch_normalization.hpp new file mode 100644 index 00000000000..4ed74cd1bbf --- /dev/null +++ b/src/gpu/nvidia/cudnn_batch_normalization.hpp @@ -0,0 +1,198 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_HPP +#define GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_HPP + +#include +#include + +#include "common/batch_normalization_pd.hpp" +#include "common/c_types_map.hpp" +#include "common/primitive.hpp" +#include "common/type_helpers.hpp" +#include "gpu/nvidia/cudnn_batch_normalization_executor.hpp" +#include "gpu/nvidia/cudnn_batch_normalization_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_batch_normalization_common_t { + template + static status_t execute( + const exec_ctx_t &ctx, engine_t *engine, const pd_t *pd) { + if (memory_desc_wrapper(pd->src_md()).has_zero_dim()) + return status::success; + return pd->executor_->execute(ctx, engine, pd->bnorm_impl_); + } + + template + static void init_ws(const pd_t *pd, memory_desc_t &ws_md) { + const auto wrap = memory_desc_wrapper(pd->src_md()); + const auto y_size = wrap.nelems(); + const size_t mean_invvar_size = 2 * pd->C(); + const dims_t ws_size + = {(dim_t)(y_size * pd->fuse_norm_relu() + mean_invvar_size)}; + + dnnl_memory_desc_init_by_tag( + &ws_md, 1, ws_size, wrap.data_type(), format_tag::x); + } +}; + +struct cudnn_batch_normalization_fwd_t : public primitive_t { + struct pd_t : public batch_normalization_fwd_pd_t { + pd_t(const batch_normalization_desc_t *adesc, + const primitive_attr_t *attr, + const batch_normalization_fwd_pd_t *hint_fwd_pd) + : batch_normalization_fwd_pd_t(adesc, attr, hint_fwd_pd) {} + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_batch_normalization_fwd_t); + + status_t init(engine_t *) { + using namespace data_type; + using namespace types; + + auto src_dt = src_md()->data_type; + const auto attr_skip_mask = primitive_attr_t::skip_mask_t::post_ops; + + bool ok = true && is_fwd() && utils::one_of(src_dt, f16, f32, s8) + && attr()->has_default_values(attr_skip_mask) + && IMPLICATION(!attr()->has_default_values(), + attr()->post_ops_.len() == 1 && with_relu_post_op()) + && IMPLICATION(utils::one_of(src_dt, s8, f16), + !is_training() && stats_is_src()) + && src_md()->format_desc.blocking.inner_nblks == 0; + if (!ok) return status::unimplemented; + + if (is_training()) { + cudnn_batch_normalization_common_t::init_ws(this, ws_md_); + } + + if (use_global_stats()) { + bnorm_impl_.reset( + new cudnn_batch_normalization_fwd_stats_impl_t()); + } else { + bnorm_impl_.reset(new cudnn_batch_normalization_fwd_impl_t()); + } + + if (!is_training() && !use_global_stats() && !use_scaleshift()) { + executor_.reset(new bnorm_exec_fwd_inf_t()); + } else if (!is_training() && use_scaleshift() + && !use_global_stats()) { + executor_.reset(new bnorm_exec_fwd_inf_ss_t()); + } else if (!use_scaleshift() && !use_global_stats()) { + executor_.reset(new bnorm_exec_fwd_t()); + } else if (use_scaleshift() && !use_global_stats()) { + executor_.reset(new bnorm_exec_fwd_ss_t); + } else if (!use_scaleshift() && use_global_stats()) { + 
// Same for training and inference + executor_.reset(new bnorm_exec_fwd_inf_stats_t()); + } else if (use_scaleshift() && use_global_stats()) { + // Same for training and inference + executor_.reset(new bnorm_exec_fwd_inf_ss_stats_t()); + } else { + return status::unimplemented; + } + + return bnorm_impl_->init(this); + } + + std::shared_ptr bnorm_impl_; + std::shared_ptr executor_; + }; + + cudnn_batch_normalization_fwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +struct cudnn_batch_normalization_bwd_t : public primitive_t { + + struct pd_t : public batch_normalization_bwd_pd_t { + pd_t(const batch_normalization_desc_t *adesc, + const primitive_attr_t *attr, + const batch_normalization_fwd_pd_t *hint_fwd_pd) + : batch_normalization_bwd_pd_t(adesc, attr, hint_fwd_pd) {} + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_batch_normalization_bwd_t); + + status_t init(engine_t *) { + using namespace data_type; + using namespace types; + + bool ok = true && is_bwd() && set_default_formats_common() + && IMPLICATION( + desc()->prop_kind == prop_kind::backward_data, + !use_scaleshift()) + && (utils::everyone_is( + f32, src_md()->data_type, diff_src_md()->data_type)) + && attr()->has_default_values() && !use_global_stats() + && src_md()->format_desc.blocking.inner_nblks == 0 + && diff_src_md()->format_desc.blocking.inner_nblks == 0; + if (!ok) return status::unimplemented; + + cudnn_batch_normalization_common_t::init_ws(this, ws_md_); + if (!compare_ws(hint_fwd_pd_)) return status::unimplemented; + + if (fuse_norm_relu()) { + bnorm_impl_.reset( + new cudnn_batch_normalization_bwd_relu_impl_t()); + } else { + bnorm_impl_.reset(new cudnn_batch_normalization_bwd_impl_t()); + } + + bool is_bwd_d = desc()->prop_kind == prop_kind::backward_data; + if (!is_bwd_d && use_scaleshift() && !use_global_stats()) { + executor_.reset(new bnorm_exec_bwd_dw_ss_t); + } else if (is_bwd_d && use_scaleshift() && !use_global_stats()) { + executor_.reset(new bnorm_exec_bwd_d_ss_t); + } else if (!use_scaleshift() && !use_global_stats()) { + // Same for bwd_d and bwd_dw + executor_.reset(new bnorm_exec_bwd_t()); + } else { + return status::unimplemented; + } + + return bnorm_impl_->init(this); + } + + std::shared_ptr bnorm_impl_; + std::shared_ptr executor_; + }; + + cudnn_batch_normalization_bwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_batch_normalization_executor.hpp b/src/gpu/nvidia/cudnn_batch_normalization_executor.hpp new file mode 100644 index 00000000000..bfe5060f13f --- /dev/null +++ b/src/gpu/nvidia/cudnn_batch_normalization_executor.hpp @@ -0,0 +1,549 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_EXECUTOR_HPP +#define GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_EXECUTOR_HPP + +#include "common/batch_normalization_pd.hpp" +#include "common/c_types_map.hpp" +#include "common/primitive.hpp" +#include "common/type_helpers.hpp" +#include "gpu/nvidia/cudnn_batch_normalization_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct bnorm_exec_base_t { + virtual status_t execute(const exec_ctx_t &ctx, engine_t *engine, + const std::shared_ptr + bnorm_impl) const = 0; + +protected: + template + void *mean_var_ptr(cl::sycl::accessor acc, sc_t &sc, + const cl::sycl::interop_handler &ih) const { + return sc.template memory(ih, acc); + } + + template + std::nullptr_t mean_var_ptr(std::nullptr_t acc, sc_t &, + const cl::sycl::interop_handler &ih) const { + return acc; + } + + template + void interop_task_fwd( + std::shared_ptr bnorm_impl, + engine_t *engine, cl::sycl::handler &cgh, + nvidia::sycl_cuda_stream_t *cuda_stream, read_acc_t src_acc, + write_acc_t dst_acc, maybe_nullptr_t mean_acc, + maybe_nullptr_t var_acc, float_acc_t scale_acc, + float_acc_t bias_acc, wkspace_st_t wkspace_st, bool init_ss, + bool init_mean_var) const { + + std::shared_ptr< + cl::sycl::accessor> + wkspace_acc; + if (!wkspace_st->is_null()) { + wkspace_acc.reset(new cl::sycl::accessor( + utils::downcast( + wkspace_st) + ->buffer() + .template get_access( + cgh))); + } + + maybe_init_mean_var(cuda_stream, mean_acc, var_acc, init_mean_var); + maybe_init_ss(cuda_stream, scale_acc, bias_acc, init_ss); + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast(engine); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + auto x = sc.memory(ih, src_acc); + auto y = sc.memory(ih, dst_acc); + auto mean = mean_var_ptr(mean_acc, sc, ih); + auto var = mean_var_ptr(var_acc, sc, ih); + auto scale = sc.memory(ih, scale_acc); + auto bias = sc.memory(ih, bias_acc) + bnorm_impl->C(); + uint8_t *y_prime = nullptr, *save_mean = nullptr, + *save_var = nullptr; + if (!wkspace_st->is_null()) { + save_mean = sc.memory(ih, *wkspace_acc); + save_var = save_mean + bnorm_impl->mean_var_size_bytes(); + y_prime = save_var + bnorm_impl->mean_var_size_bytes(); + } + + std::shared_ptr args(new bnorm_fwd_args_t(x, y, mean, + var, scale, bias, y_prime, save_mean, save_var)); + + bnorm_impl->execute(handle, args); + }); + } + + template + void interop_task_bwd( + std::shared_ptr bnorm_impl, + engine_t *engine, cl::sycl::handler &cgh, + nvidia::sycl_cuda_stream_t *cuda_stream, read_acc_t src_acc, + read_acc_t diff_dst_acc, write_acc_t diff_src_acc, + ss_acc_t scale_acc, ss_acc_t bias_acc, + d_ss_acc_t diff_scaleshift_acc, read_acc_t wkspace_acc, + std::shared_ptr> + temp_relu_output, + bool init_ss, bool 
init_mean_var) const { + + maybe_init_ss(cuda_stream, scale_acc, bias_acc, init_ss); + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast(engine); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + auto x = sc.memory(ih, src_acc); + auto dy = sc.memory(ih, diff_dst_acc); + auto dx = sc.memory(ih, diff_src_acc); + auto scale = sc.memory(ih, scale_acc); + auto bias = sc.memory(ih, bias_acc) + + (bnorm_impl->C() * sizeof(float)); + auto diff_scale = sc.memory(ih, diff_scaleshift_acc); + auto diff_bias = diff_scale + (bnorm_impl->C() * sizeof(float)); + auto save_mean = sc.memory(ih, wkspace_acc); + auto save_var = save_mean + bnorm_impl->mean_var_size_bytes(); + auto wkspace = save_var + bnorm_impl->mean_var_size_bytes(); + auto relu_dy = bnorm_impl->fuse_norm_relu() + ? sc.memory(ih, *temp_relu_output) + : nullptr; + + std::shared_ptr args( + new bnorm_bwd_args_t(x, dx, dy, save_mean, save_var, scale, + bias, diff_scale, diff_bias, wkspace, relu_dy)); + + bnorm_impl->execute(handle, args); + }); + } + + template + void maybe_init_ss( + nvidia::sycl_cuda_stream_t *cuda_stream, T, T, bool) const {} + + template + void maybe_init_ss(nvidia::sycl_cuda_stream_t *cuda_stream, + cl::sycl::accessor scale_acc, + cl::sycl::accessor bias_acc, + bool init_ss) const { + if (init_ss) { + constexpr T scale_val = 1, bias_val = 0; + cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + cgh.fill(scale_acc, scale_val); + }); + + cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + cgh.fill(bias_acc, bias_val); + }); + } + } + + // Handle the cases when mean and var are read-only accessors or nullptr + template + void maybe_init_mean_var( + nvidia::sycl_cuda_stream_t *cuda_stream, T, T, bool) const {} + + template + void maybe_init_mean_var(nvidia::sycl_cuda_stream_t *cuda_stream, + cl::sycl::accessor mean_acc, + cl::sycl::accessor var_acc, + bool init_mean_var) const { + if (init_mean_var) { + constexpr T mean_var_val = 0; + cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + cgh.fill(mean_acc, mean_var_val); + }); + + cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + cgh.fill(var_acc, mean_var_val); + }); + } + } +}; + +struct bnorm_exec_fwd_inf_t : public bnorm_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + std::shared_ptr bnorm_impl) + const override { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + auto wkspace_storage = bnorm_impl->is_training() + ? 
ctx.output(DNNL_ARG_WORKSPACE)->memory_storage() + : &memory_storage_t::empty_storage(); + + auto n_channels = bnorm_impl->C(); + cl::sycl::buffer scaleshift_buff(n_channels * 2); + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto scale_acc + = scaleshift_buff.get_access( + cgh, n_channels, 0); + auto bias_acc + = scaleshift_buff.get_access( + cgh, n_channels, n_channels); + bool init_ss = true, init_mean_var = false; + + interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc, + dst_acc, nullptr, nullptr, scale_acc, bias_acc, + wkspace_storage, init_ss, init_mean_var); + }); + } +}; + +struct bnorm_exec_fwd_inf_ss_t : public bnorm_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + std::shared_ptr bnorm_impl) + const override { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + auto wkspace_storage = bnorm_impl->is_training() + ? ctx.output(DNNL_ARG_WORKSPACE)->memory_storage() + : &memory_storage_t::empty_storage(); + + auto scaleshift_buff + = utils::downcast( + &CTX_IN_STORAGE(DNNL_ARG_SCALE_SHIFT)) + ->buffer(); + auto n_channels = bnorm_impl->C(); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto scale_acc + = scaleshift_buff.get_access( + cgh, n_channels, 0); + auto bias_acc + = scaleshift_buff.get_access( + cgh, n_channels, n_channels); + bool init_ss = false, init_mean_var = false; + + interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc, + dst_acc, nullptr, nullptr, scale_acc, bias_acc, + wkspace_storage, init_ss, init_mean_var); + }); + } +}; + +struct bnorm_exec_fwd_inf_stats_t : public bnorm_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + std::shared_ptr bnorm_impl) + const override { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + auto wkspace_storage = bnorm_impl->is_training() + ? ctx.output(DNNL_ARG_WORKSPACE)->memory_storage() + : &memory_storage_t::empty_storage(); + + auto n_channels = bnorm_impl->C(); + cl::sycl::buffer scaleshift_buff(n_channels * 2); + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto mean_acc = CTX_IN_ACCESSOR(DNNL_ARG_MEAN); + auto var_acc = CTX_IN_ACCESSOR(DNNL_ARG_VARIANCE); + auto scale_acc + = scaleshift_buff.get_access( + cgh, n_channels, 0); + auto bias_acc + = scaleshift_buff.get_access( + cgh, n_channels, n_channels); + bool init_ss = true, init_mean_var = false; + + interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc, + dst_acc, mean_acc, var_acc, scale_acc, bias_acc, + wkspace_storage, init_ss, init_mean_var); + }); + } +}; + +struct bnorm_exec_fwd_inf_ss_stats_t : public bnorm_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + std::shared_ptr bnorm_impl) + const override { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + auto wkspace_storage = bnorm_impl->is_training() + ? 
ctx.output(DNNL_ARG_WORKSPACE)->memory_storage() + : &memory_storage_t::empty_storage(); + + auto scaleshift_buff + = utils::downcast( + &CTX_IN_STORAGE(DNNL_ARG_SCALE_SHIFT)) + ->buffer(); + auto n_channels = bnorm_impl->C(); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto mean_acc = CTX_IN_ACCESSOR(DNNL_ARG_MEAN); + auto var_acc = CTX_IN_ACCESSOR(DNNL_ARG_VARIANCE); + auto scale_acc + = scaleshift_buff.get_access( + cgh, n_channels, 0); + auto bias_acc + = scaleshift_buff.get_access( + cgh, n_channels, n_channels); + bool init_ss = false, init_mean_var = false; + + interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc, + dst_acc, mean_acc, var_acc, scale_acc, bias_acc, + wkspace_storage, init_ss, init_mean_var); + }); + } +}; + +struct bnorm_exec_fwd_t : public bnorm_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + std::shared_ptr bnorm_impl) + const override { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + auto wkspace_storage = bnorm_impl->is_training() + ? ctx.output(DNNL_ARG_WORKSPACE)->memory_storage() + : &memory_storage_t::empty_storage(); + + auto n_channels = bnorm_impl->C(); + + cl::sycl::buffer scaleshift_buff(n_channels * 2); + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto mean_acc = CTX_OUT_ACCESSOR(DNNL_ARG_MEAN); + auto var_acc = CTX_OUT_ACCESSOR(DNNL_ARG_VARIANCE); + auto scale_acc + = scaleshift_buff.get_access( + cgh, n_channels, 0); + auto bias_acc + = scaleshift_buff.get_access( + cgh, n_channels, n_channels); + bool init_ss = true, init_mean_var = true; + + interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc, + dst_acc, mean_acc, var_acc, scale_acc, bias_acc, + wkspace_storage, init_ss, init_mean_var); + }); + } +}; + +struct bnorm_exec_fwd_ss_t : public bnorm_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + std::shared_ptr bnorm_impl) + const override { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + auto wkspace_storage = bnorm_impl->is_training() + ? 
ctx.output(DNNL_ARG_WORKSPACE)->memory_storage() + : &memory_storage_t::empty_storage(); + + auto scaleshift_buff + = utils::downcast( + &CTX_IN_STORAGE(DNNL_ARG_SCALE_SHIFT)) + ->buffer(); + auto n_channels = bnorm_impl->C(); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto mean_acc = CTX_OUT_ACCESSOR(DNNL_ARG_MEAN); + auto var_acc = CTX_OUT_ACCESSOR(DNNL_ARG_VARIANCE); + auto scale_acc + = scaleshift_buff.get_access( + cgh, n_channels, 0); + auto bias_acc + = scaleshift_buff.get_access( + cgh, n_channels, n_channels); + bool init_ss = false, init_mean_var = true; + + interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc, + dst_acc, mean_acc, var_acc, scale_acc, bias_acc, + wkspace_storage, init_ss, init_mean_var); + }); + } +}; + +struct bnorm_exec_bwd_t : public bnorm_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + std::shared_ptr bnorm_impl) + const override { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + auto n_channels = bnorm_impl->C(); + cl::sycl::buffer scaleshift_buff(n_channels * 2); + cl::sycl::buffer diff_scaleshift_buff(n_channels * 2); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC); + auto wkspace_acc = CTX_IN_ACCESSOR(DNNL_ARG_WORKSPACE); + auto diff_scaleshift_acc + = diff_scaleshift_buff + .get_access(cgh); + auto scale_acc + = scaleshift_buff.get_access( + cgh, n_channels, 0); + auto bias_acc + = scaleshift_buff.get_access( + cgh, n_channels, n_channels); + bool init_ss = true, init_mean_var = false; + + std::shared_ptr> + temp_relu_output = nullptr; + if (bnorm_impl->fuse_norm_relu()) { + temp_relu_output = std::make_shared>( + CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none)); + } + + interop_task_bwd(bnorm_impl, engine, cgh, cuda_stream, src_acc, + diff_dst_acc, diff_src_acc, scale_acc, bias_acc, + diff_scaleshift_acc, wkspace_acc, temp_relu_output, init_ss, + init_mean_var); + }); + } +}; + +struct bnorm_exec_bwd_dw_ss_t : public bnorm_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + std::shared_ptr bnorm_impl) + const override { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + auto scaleshift_buff + = utils::downcast( + &CTX_IN_STORAGE(DNNL_ARG_SCALE_SHIFT)) + ->buffer(); + + auto n_channels = bnorm_impl->C(); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC); + auto diff_scaleshift_acc + = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SCALE_SHIFT); + auto scale_acc + = scaleshift_buff.get_access( + cgh, n_channels, 0); + auto bias_acc + = scaleshift_buff.get_access( + cgh, n_channels, n_channels); + auto wkspace_acc = CTX_IN_ACCESSOR(DNNL_ARG_WORKSPACE); + bool init_ss = false, init_mean_var = false; + + std::shared_ptr> + temp_relu_output = nullptr; + if (bnorm_impl->fuse_norm_relu()) { + temp_relu_output = std::make_shared>( + CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none)); + } + + interop_task_bwd(bnorm_impl, engine, cgh, cuda_stream, src_acc, + diff_dst_acc, diff_src_acc, scale_acc, bias_acc, + diff_scaleshift_acc, wkspace_acc, temp_relu_output, init_ss, 
+ init_mean_var); + }); + } +}; + +struct bnorm_exec_bwd_d_ss_t : public bnorm_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + std::shared_ptr bnorm_impl) + const override { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + auto scaleshift_buff + = utils::downcast( + &CTX_IN_STORAGE(DNNL_ARG_SCALE_SHIFT)) + ->buffer(); + auto n_channels = bnorm_impl->C(); + + cl::sycl::buffer diff_scaleshift_buff(n_channels * 2); + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC); + auto scale_acc + = scaleshift_buff.get_access( + cgh, n_channels, 0); + auto bias_acc + = scaleshift_buff.get_access( + cgh, n_channels, n_channels); + auto diff_scaleshift_acc + = diff_scaleshift_buff + .get_access(cgh); + auto wkspace_acc = CTX_IN_ACCESSOR(DNNL_ARG_WORKSPACE); + bool init_ss = false, init_mean_var = false; + + std::shared_ptr> + temp_relu_output = nullptr; + if (bnorm_impl->fuse_norm_relu()) { + temp_relu_output = std::make_shared>( + CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none)); + } + + interop_task_bwd(bnorm_impl, engine, cgh, cuda_stream, src_acc, + diff_dst_acc, diff_src_acc, scale_acc, bias_acc, + diff_scaleshift_acc, wkspace_acc, temp_relu_output, init_ss, + init_mean_var); + }); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_batch_normalization_impl.hpp b/src/gpu/nvidia/cudnn_batch_normalization_impl.hpp new file mode 100644 index 00000000000..30f9d77ceb1 --- /dev/null +++ b/src/gpu/nvidia/cudnn_batch_normalization_impl.hpp @@ -0,0 +1,347 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_IMPL_HPP +#define GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_IMPL_HPP + +#include + +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct bnorm_args_t { +public: + bnorm_args_t(void *x, void *mean, void *var, void *scale, void *bias) + : x_(x), mean_(mean), var_(var), scale_(scale), bias_(bias) {} + + void *x_, *mean_, *var_, *scale_, *bias_; +}; + +struct bnorm_fwd_args_t : public bnorm_args_t { + bnorm_fwd_args_t(void *x, void *y, void *mean, void *var, void *scale, + void *bias, void *y_prime, void *save_mean, void *save_var) + : bnorm_args_t::bnorm_args_t(x, mean, var, scale, bias) + , y_(y) + , y_prime_(y_prime) + , save_mean_(save_mean) + , save_var_(save_var) {} + + void *y_, *y_prime_, *save_mean_, *save_var_; +}; + +struct bnorm_bwd_args_t : public bnorm_args_t { + bnorm_bwd_args_t(void *x, void *dx, void *dy, void *mean, void *var, + void *scale, void *bias, void *diff_scale, void *diff_bias, + void *wkspace, void *relu_dx) + : bnorm_args_t(x, mean, var, scale, bias) + , dx_(dx) + , dy_(dy) + , diff_scale_(diff_scale) + , diff_bias_(diff_bias) + , wkspace_(wkspace) + , relu_dx_(relu_dx) {} + + void *dx_, *dy_, *diff_scale_, *diff_bias_, *wkspace_, *relu_dx_; +}; + +struct cudnn_batch_normalization_impl_base_t { + virtual ~cudnn_batch_normalization_impl_base_t() { + for (size_t i = 0; i < NUM_IO; ++i) { + if (tensor_descs_[i]) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorDescriptor, tensor_descs_[i]); + } + } + + if ((fuse_norm_relu_ || with_relu_postop_) && act_desc_) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyActivationDescriptor, act_desc_); + } + } + + virtual status_t init(batch_normalization_pd_t *pd) = 0; + + virtual void execute( + cudnnHandle_t handle, std::shared_ptr args) const = 0; + + bool is_bwd_d() const { return is_bwd_data_; } + bool is_training() const { return is_training_; } + bool fuse_norm_relu() const { return fuse_norm_relu_; } + std::size_t dt_size() const { return dt_size_; } + std::size_t mean_var_size_bytes() { return mean_var_size_bytes_; } + uint8_t default_mean_var() const { return 0; } + int C() const { return nchannels_; } + +protected: + status_t init_common(batch_normalization_pd_t *pd) { + ndims_ = pd->ndims() < 4 ? 
4 : pd->ndims(); + if (ndims_ > 5) { return status::invalid_arguments; } + + memory_desc_wrapper wrap(pd->src_md()); + fuse_norm_relu_ = pd->fuse_norm_relu(); + is_training_ = pd->is_training(); + with_global_stats_ = pd->use_global_stats(); + is_bwd_data_ = pd->desc()->prop_kind == prop_kind::backward_data; + dt_size_ = types::data_type_size(wrap.data_type()); + nchannels_ = pd->C(); + mean_var_size_bytes_ = nchannels_ * dt_size_; + eps_ = pd->desc()->batch_norm_epsilon; + y_prime_size_ = wrap.nelems() * dt_size_; + with_relu_postop_ = pd->with_relu_post_op(); + + auto n = static_cast(pd->MB() * pd->D() * pd->H() * pd->W()); + var_scaling_factor_ = (n - 1.f) / n; + + convert_dims(pd->src_md()->padded_dims, dims_[src], pd->ndims()); + convert_dims(pd->src_md()->format_desc.blocking.strides, strides_[src], + pd->ndims()); + + CHECK(convert_data_type(pd->src_md(), &data_types_[src])); + + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[src], + data_types_[src], ndims_, dims_[src], strides_[src])); + CHECK(create_and_set_scaleshift_desc()); + if (fuse_norm_relu_ || with_relu_postop_) { + CHECK(create_and_set_activation_desc()); + } + + return status::success; + } + + virtual status_t create_and_set_scaleshift_desc() { + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnCreateTensorDescriptor, &tensor_descs_[scl])); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnDeriveBNTensorDescriptor, + tensor_descs_[scl], tensor_descs_[src], mode_)); + + return status::success; + } + + virtual status_t create_and_set_activation_desc() { + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnCreateActivationDescriptor, &act_desc_)); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, act_desc_, + CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, relu_coef_)); + + return status::success; + } + + virtual status_t to_population_variance( + cudnnHandle_t handle, void *var) const { + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnScaleTensor, handle, tensor_descs_[scl], + var, &var_scaling_factor_)); + + return status::success; + } + + enum io { src = 0, dst, scl, NUM_IO }; + cudnnDataType_t data_types_[NUM_IO]; + cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {}; + cudnnActivationDescriptor_t act_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL; + int dims_[NUM_IO][DNNL_MAX_NDIMS]; + int strides_[NUM_IO][DNNL_MAX_NDIMS]; + int ndims_, nchannels_; + float alpha_ = 1.f, beta = 0.f; + double relu_coef_ = 0.0; + double factor_ = 1.0; + double eps_ = CUDNN_BN_MIN_EPSILON; + float var_scaling_factor_ = 0.f; + bool fuse_norm_relu_ = false; + bool with_relu_postop_ = false; + bool with_global_stats_ = false; + bool is_training_ = false; + bool is_bwd_data_ = false; + std::size_t y_prime_size_; + std::size_t dt_size_, mean_var_size_bytes_; +}; + +struct cudnn_batch_normalization_fwd_impl_t + : public cudnn_batch_normalization_impl_base_t { + using cudnn_batch_normalization_impl_base_t:: + cudnn_batch_normalization_impl_base_t; + + status_t init(batch_normalization_pd_t *pd) override { + init_common(pd); + + convert_dims(pd->dst_md()->padded_dims, dims_[dst], pd->ndims()); + convert_dims(pd->dst_md()->format_desc.blocking.strides, strides_[dst], + pd->ndims()); + + CHECK(convert_data_type(pd->dst_md(), &data_types_[dst])); + + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst], + data_types_[dst], ndims_, dims_[dst], strides_[dst])); + + return status::success; + } + + void execute(cudnnHandle_t handle, + std::shared_ptr args) const override { + auto fwd_args = static_cast(args.get()); + + CUDNN_EXECUTE_FUNC(cudnnBatchNormalizationForwardTraining, 
handle, + mode_, &alpha_, &beta, tensor_descs_[src], fwd_args->x_, + tensor_descs_[dst], fwd_args->y_, tensor_descs_[scl], + fwd_args->scale_, fwd_args->bias_, factor_, fwd_args->mean_, + fwd_args->var_, eps_, fwd_args->save_mean_, + fwd_args->save_var_); + + if (is_training_) { to_population_variance(handle, fwd_args->var_); } + + if (fuse_norm_relu_ || with_relu_postop_) { do_relu(handle, fwd_args); } + } + +protected: + void do_relu(cudnnHandle_t handle, bnorm_fwd_args_t *fwd_args) const { + if (is_training_ && fuse_norm_relu_) { + // Copy the result to the workspace + CUDNN_EXECUTE_FUNC(cudnnAddTensor, handle, &alpha_, + tensor_descs_[dst], fwd_args->y_, &beta, tensor_descs_[dst], + fwd_args->y_prime_); + } + + CUDNN_EXECUTE_FUNC(cudnnActivationForward, handle, act_desc_, &alpha_, + tensor_descs_[dst], fwd_args->y_, &beta, tensor_descs_[dst], + fwd_args->y_); + } +}; + +struct cudnn_batch_normalization_fwd_stats_impl_t + : public cudnn_batch_normalization_fwd_impl_t { + + status_t init(batch_normalization_pd_t *pd) override { + return cudnn_batch_normalization_fwd_impl_t::init(pd); + } + + void execute(cudnnHandle_t handle, + std::shared_ptr args) const override { + auto fwd_args = static_cast(args.get()); + + CUDNN_EXECUTE_FUNC(cudnnBatchNormalizationForwardInference, handle, + mode_, &alpha_, &beta, tensor_descs_[src], fwd_args->x_, + tensor_descs_[dst], fwd_args->y_, tensor_descs_[scl], + fwd_args->scale_, fwd_args->bias_, fwd_args->mean_, + fwd_args->var_, eps_); + + if (fuse_norm_relu_ || with_relu_postop_) { do_relu(handle, fwd_args); } + } +}; + +struct cudnn_batch_normalization_bwd_impl_t + : public cudnn_batch_normalization_impl_base_t { + + status_t init(batch_normalization_pd_t *pd) override { + init_common(pd); + + convert_dims(pd->diff_src_md()->padded_dims, diff_dims_[diff_src], + pd->ndims()); + convert_dims(pd->diff_dst_md()->padded_dims, diff_dims_[diff_dst], + pd->ndims()); + + convert_dims(pd->diff_src_md()->format_desc.blocking.strides, + strides_[diff_src], pd->ndims()); + convert_dims(pd->diff_dst_md()->format_desc.blocking.strides, + strides_[diff_dst], pd->ndims()); + + CHECK(convert_data_type( + pd->diff_src_md(), &diff_data_types_[diff_src])); + CHECK(convert_data_type( + pd->diff_dst_md(), &diff_data_types_[diff_dst])); + + CHECK(create_and_set_tensor_descriptor(&diff_tensor_descs_[diff_src], + data_types_[diff_src], ndims_, diff_dims_[diff_src], + strides_[diff_src])); + CHECK(create_and_set_tensor_descriptor(&diff_tensor_descs_[diff_dst], + data_types_[diff_dst], ndims_, diff_dims_[diff_dst], + strides_[diff_dst])); + + return status::success; + } + + void execute(cudnnHandle_t handle, + std::shared_ptr args) const override { + auto bwd_args = static_cast(args.get()); + + CUDNN_EXECUTE_FUNC(cudnnBatchNormalizationBackward, handle, mode_, + &a_data_diff_, &b_data_diff_, &a_param_diff_, &b_param_diff_, + tensor_descs_[src], bwd_args->x_, diff_tensor_descs_[diff_dst], + bwd_args->dy_, diff_tensor_descs_[diff_src], bwd_args->dx_, + tensor_descs_[scl], bwd_args->scale_, bwd_args->diff_scale_, + bwd_args->diff_bias_, eps_, bwd_args->mean_, bwd_args->var_); + } + + ~cudnn_batch_normalization_bwd_impl_t() { + for (size_t i = 0; i < NUM_DIFF; i++) { + if (diff_tensor_descs_[i]) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorDescriptor, diff_tensor_descs_[i]); + } + } + } + +protected: + const float a_data_diff_ = 1.f, b_data_diff_ = 0.f; + const float a_param_diff_ = 1.f, b_param_diff_ = 0.f; + + enum diff_tensors { diff_src = 0, diff_dst, NUM_DIFF }; + int 
diff_dims_[NUM_DIFF][DNNL_MAX_NDIMS]; + cudnnTensorDescriptor_t diff_tensor_descs_[NUM_DIFF] = {}; + cudnnDataType_t diff_data_types_[NUM_DIFF]; +}; + +struct cudnn_batch_normalization_bwd_relu_impl_t + : public cudnn_batch_normalization_bwd_impl_t { + + status_t init(batch_normalization_pd_t *pd) override { + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_none, + memory_desc_wrapper(pd->diff_dst_md()).size(), size_t(1)); + + return cudnn_batch_normalization_bwd_impl_t::init(pd); + } + + void execute(cudnnHandle_t handle, + std::shared_ptr args) const override { + auto bwd_args = static_cast(args.get()); + + CUDNN_EXECUTE_FUNC(cudnnActivationBackward, handle, act_desc_, &alpha_, + diff_tensor_descs_[dst], bwd_args->wkspace_, + diff_tensor_descs_[dst], bwd_args->dy_, diff_tensor_descs_[dst], + bwd_args->wkspace_, &beta, diff_tensor_descs_[dst], + bwd_args->relu_dx_); + + CUDNN_EXECUTE_FUNC(cudnnBatchNormalizationBackward, handle, mode_, + &a_data_diff_, &b_data_diff_, &a_param_diff_, &b_param_diff_, + tensor_descs_[src], bwd_args->x_, diff_tensor_descs_[dst], + bwd_args->relu_dx_, diff_tensor_descs_[src], bwd_args->dx_, + tensor_descs_[scl], bwd_args->scale_, bwd_args->diff_scale_, + bwd_args->diff_bias_, eps_, bwd_args->mean_, bwd_args->var_); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_binary.cpp b/src/gpu/nvidia/cudnn_binary.cpp new file mode 100644 index 00000000000..c2001633fa7 --- /dev/null +++ b/src/gpu/nvidia/cudnn_binary.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "gpu/nvidia/cudnn_binary.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "sycl/sycl_buffer_memory_storage.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_binary_t::execute(const exec_ctx_t &ctx) const { + if (memory_desc_wrapper(pd()->src_md(0)).has_zero_dim()) + return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_0_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC_0); + auto src_1_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC_1); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + auto a = sc.memory(ih, src_0_acc); + auto b = sc.memory(ih, src_1_acc); + auto c = sc.memory(ih, dst_acc); + + pd()->binary_impl_->execute(handle, a, b, c); + }); + }); +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_binary.hpp b/src/gpu/nvidia/cudnn_binary.hpp new file mode 100644 index 00000000000..e576e763c87 --- /dev/null +++ b/src/gpu/nvidia/cudnn_binary.hpp @@ -0,0 +1,125 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_BINARY_HPP +#define GPU_NVIDIA_CUDNN_BINARY_HPP + +#include "cudnn.h" + +#include + +#include "common/binary_pd.hpp" +#include "common/c_types_map.hpp" +#include "common/primitive.hpp" +#include "gpu/nvidia/cudnn_binary_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_binary_t : public primitive_t { + + struct pd_t : public binary_pd_t { + using binary_pd_t::binary_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_binary_t); + + status_t init(engine_t *) { + using namespace data_type; + + bool ok = (set_default_params() == status::success) + && check_data_types() && check_no_blocking() + && IMPLICATION( + utils::one_of(src_md(0)->data_type, f32, f16), + attr()->has_default_values()) + && IMPLICATION(utils::one_of(src_md(0)->data_type, s8), + attr()->has_default_values( + primitive_attr_t::skip_mask_t::scales)) + && IMPLICATION(!attr()->scales_.has_default_values(), + check_scales_mask()); + + if (!ok) return status::unimplemented; + + if (check_for_zero_dims()) return status::success; + + binary_impl_.reset(new cudnn_binary_impl_t()); + + return binary_impl_->init(this); + } + + bool check_for_zero_dims() const { + return has_zero_dims(src_md(0)->dims, src_md(0)->ndims) + || has_zero_dims(src_md(1)->dims, src_md(1)->ndims) + || has_zero_dims(dst_md()->dims, dst_md()->ndims); + } + + bool check_scales_mask() const { + for (const auto &s : attr()->scales_.scales_) { + if (s.second.mask_ != 0) return false; + } + return true; + } + + bool check_no_blocking() const { + // Blocking is not supported by cudnnOpTensor, return false if any + // blocks are present + return src_md(0)->format_desc.blocking.inner_nblks + + src_md(1)->format_desc.blocking.inner_nblks + + dst_md()->format_desc.blocking.inner_nblks + == 0; + } + + bool check_data_types() const { + using namespace data_type; + bool inputs_same = src_md(0)->data_type == src_md(1)->data_type; + dnnl_data_type_t input_type = src_md(0)->data_type; + dnnl_data_type_t output_type = dst_md()->data_type; + + switch (output_type) { + case f32: + return inputs_same + && (input_type == f32 || input_type == s8 + || input_type == f16); + case f16: + return inputs_same + && (input_type == f32 || input_type == f16); + case s8: + return inputs_same + && (input_type == f32 || input_type == s8); + } + return false; + } + std::shared_ptr binary_impl_; + }; + + cudnn_binary_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_binary_impl.hpp b/src/gpu/nvidia/cudnn_binary_impl.hpp new file mode 100644 index 00000000000..6e348ef1e9d --- /dev/null +++ b/src/gpu/nvidia/cudnn_binary_impl.hpp @@ -0,0 +1,143 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_BINARY_IMPL_HPP +#define GPU_NVIDIA_CUDNN_BINARY_IMPL_HPP + +#include "cudnn.h" + +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_binary_impl_base_t { + enum io { src_0 = 0, src_1, dst_0, NUM_IO }; + cudnnDataType_t data_types[NUM_IO]; + int ndims; + int dims[NUM_IO][DNNL_MAX_NDIMS]; + cudnnOpTensorDescriptor_t op_desc = nullptr; + cudnnTensorDescriptor_t tensor_descs[NUM_IO] = {}; + cudnnOpTensorOp_t alg_kind; + float alpha[2]; + float beta = 0.0f; + + virtual ~cudnn_binary_impl_base_t() { + if (op_desc) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyOpTensorDescriptor, op_desc); + } + for (size_t i = 0; i < NUM_IO; i++) { + if (tensor_descs[i]) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorDescriptor, tensor_descs[i]); + } + } + } + + virtual status_t init(const binary_pd_t *pd) = 0; + + void execute(cudnnHandle_t handle, void *a, void *b, void *c) const { + CUDNN_EXECUTE_FUNC(cudnnOpTensor, handle, op_desc, &alpha[0], + tensor_descs[src_0], a, &alpha[1], tensor_descs[src_1], b, + &beta, tensor_descs[dst_0], c); + } + + virtual status_t create_and_set_op_descriptor() { + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateOpTensorDescriptor, &op_desc)); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetOpTensorDescriptor, op_desc, + alg_kind, cudnnDataType_t::CUDNN_DATA_FLOAT, + cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN)); + + return status::success; + } + + status_t convert_alg_kind( + alg_kind_t alg_kind, cudnnOpTensorOp_t *cuda_alg_kind) const { + switch (alg_kind) { + case alg_kind::binary_add: + *cuda_alg_kind = cudnnOpTensorOp_t::CUDNN_OP_TENSOR_ADD; + break; + case alg_kind::binary_mul: + *cuda_alg_kind = cudnnOpTensorOp_t::CUDNN_OP_TENSOR_MUL; + break; + case alg_kind::binary_min: + *cuda_alg_kind = cudnnOpTensorOp_t::CUDNN_OP_TENSOR_MIN; + break; + case alg_kind::binary_max: + *cuda_alg_kind = cudnnOpTensorOp_t::CUDNN_OP_TENSOR_MAX; + break; + default: return status::unimplemented; + } + return status::success; + } +}; + +struct cudnn_binary_impl_t : public cudnn_binary_impl_base_t { + int strides[NUM_IO][DNNL_MAX_NDIMS]; + + status_t init(const binary_pd_t *pd) override { + // If any of the dimensions are 0 we should not continue with creating + // cudnn descriptors + if (has_zero_dims(pd->src_md(0)->dims, pd->ndims())) { + return status::success; + } + if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; } + ndims = pd->ndims() < 4 ? 
4 : pd->ndims(); + convert_dims(pd->src_md(0)->padded_dims, dims[src_0], pd->ndims()); + convert_dims(pd->src_md(1)->padded_dims, dims[src_1], pd->ndims()); + convert_dims(pd->dst_md()->padded_dims, dims[dst_0], pd->ndims()); + + convert_dims(pd->src_md(0)->format_desc.blocking.strides, + strides[src_0], pd->ndims()); + convert_dims(pd->src_md(1)->format_desc.blocking.strides, + strides[src_1], pd->ndims()); + convert_dims(pd->dst_md()->format_desc.blocking.strides, strides[dst_0], + pd->ndims()); + alg_kind_t alg = pd->desc()->alg_kind; + auto alg_ok = convert_alg_kind(alg, &alg_kind); + if (alg_ok != status::success) { return status::unimplemented; } + + CHECK(convert_data_type(pd->src_md(0), &data_types[src_0])); + CHECK(convert_data_type(pd->src_md(1), &data_types[src_1])); + CHECK(convert_data_type(pd->dst_md(), &data_types[dst_0])); + + bool do_scaling = pd->src_md(0)->data_type == dnnl_data_type_t::dnnl_s8; + auto scales_0 = pd->attr()->scales_.get(1).scales_; + auto scales_1 = pd->attr()->scales_.get(2).scales_; + alpha[0] = do_scaling ? scales_0[0] : 1.0f; + alpha[1] = do_scaling ? scales_1[0] : 1.0f; + + CHECK(create_and_set_tensor_descriptor(&tensor_descs[src_0], + data_types[src_0], ndims, dims[src_0], strides[src_0])); + CHECK(create_and_set_tensor_descriptor(&tensor_descs[src_1], + data_types[src_1], ndims, dims[src_1], strides[src_1])); + CHECK(create_and_set_tensor_descriptor(&tensor_descs[dst_0], + data_types[dst_0], ndims, dims[dst_0], strides[dst_0])); + CHECK(create_and_set_op_descriptor()); + return status::success; + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_concat.cpp b/src/gpu/nvidia/cudnn_concat.cpp new file mode 100644 index 00000000000..bf19b548db0 --- /dev/null +++ b/src/gpu/nvidia/cudnn_concat.cpp @@ -0,0 +1,42 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/ocl/ref_concat.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +namespace { + +using cpd_create_f = dnnl::impl::engine_t::concat_primitive_desc_create_f; + +const cpd_create_f cuda_concat_impl_list[] + = {gpu::ocl::ref_concat_t::pd_t::create, nullptr}; +} // namespace + +const cpd_create_f * +cuda_gpu_engine_impl_list_t::get_concat_implementation_list() { + return cuda_concat_impl_list; +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_conv_filter_adjustment_base.hpp b/src/gpu/nvidia/cudnn_conv_filter_adjustment_base.hpp new file mode 100644 index 00000000000..74bf281ce88 --- /dev/null +++ b/src/gpu/nvidia/cudnn_conv_filter_adjustment_base.hpp @@ -0,0 +1,169 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_CONV_FILTER_ADJUSTMENT_BASE_HPP +#define GPU_NVIDIA_CUDNN_CONV_FILTER_ADJUSTMENT_BASE_HPP + +#include "cublas_v2.h" +#include "cudnn.h" + +#include "common/type_helpers.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_conv_filter_adjustment_base_t { +public: + float filter_alpha_ = 1, filter_beta_ = 0; + cudnnTensorDescriptor_t current_filter_desc_, transform_filter_desc_; + // for filter in convolution, cuDNN only support nchw and nhwc. + // the hwio and dhwio is not supported and should be converted + // to either of the above format. + virtual bool supported_filter_format(const memory_desc_t *md) { + const memory_desc_wrapper mem_wrapper(md); + /// NOTE: the transformation for oidhw to oihwd is disabled until cuDNN + // fixes the the current bug for oihwd format. the transformation for + // odhwi to ohwdi has been disabled until cuDNN provides support for + // 3d convolution in ohwdi format. + return (!(mem_wrapper.matches_one_of_tag(/*format_tag::oidhw,*/ + /*format_tag::odhwi,*/ format_tag::dhwio, format_tag::hwio))); + } + + virtual ~cudnn_conv_filter_adjustment_base_t() { + if (current_filter_desc_) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorDescriptor, current_filter_desc_); + } + if (transform_filter_desc_) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorDescriptor, transform_filter_desc_); + } + } + + void propagate_strides(int *strides, const int *dims, + std::initializer_list perm) const { + int prev_p = -1; + for (auto p : perm) { + strides[p] = prev_p == -1 ? 
1 : strides[prev_p] * dims[prev_p]; + prev_p = p; + } + } + + virtual status_t init_filter_transformation( + cudnnDataType_t filter_data_types, int filter_ndims, + int *filter_dims, int *current_filter_strides, + int *transform_filter_strides) { + // Set a descriptor for the current filter. + CHECK(create_and_set_tensor_descriptor(¤t_filter_desc_, + filter_data_types, filter_ndims, filter_dims, + current_filter_strides)); + // Set a descriptor for the transform filter. + CHECK(create_and_set_tensor_descriptor(&transform_filter_desc_, + filter_data_types, filter_ndims, filter_dims, + transform_filter_strides)); + return status::success; + } + + virtual void set_filter_nchw( + int filter_ndims, int *transform_filter_strides, int *filter_dims) { + switch (filter_ndims) { + case 4: // Convert to KCRS + return propagate_strides( + transform_filter_strides, filter_dims, {3, 2, 1, 0}); + case 5: + /// NOTE: cuDNN claims the filter must be in kcrsd . However + // in the current version(7.6.5) it accepts kcdrs filter is the + // same as ncdhw tensor. So according to cuDNN code should + // looks like: + // propagate_strides( + // transform_filter_strides, filter_dims, {2, 4, 3, 1, 0}); + // However, executing the code shows that they actually expect + // the filter format to be kcdrs. Therefore, we convert the + // filter to kcdrs instead: + // propagate_strides( + // transform_filter_strides, filter_dims, {4, 3, 2, 1, 0}); + + return propagate_strides( + transform_filter_strides, filter_dims, {4, 3, 2, 1, 0}); + case 6: + return propagate_strides(transform_filter_strides, filter_dims, + {5, 4, 3, 2, 1, 0}); + } + } + virtual void set_filter_nhwc( + int filter_ndims, int *transform_filter_strides, int *filter_dims) { + switch (filter_ndims) { + case 4: // Convert to krsc + return propagate_strides( + transform_filter_strides, filter_dims, {1, 3, 2, 0}); + case 5: + /// NOTE: Convert to krsdc. There is no support for krsdc and + // 3d convolution in the current version. So we convert the + // filter to ndhwc and then fold the dhwc for both srd and + // filter to make it a 4d conv. So according to cuDNN code + // should looks like: + // propagate_strides( + // transform_filter_strides, filter_dims, {1, 2, 4, 3, + // 0}); + // However, executing the code shows that they actually expect + // the filter format to be kdrsc. 
Therefore, we convert the + // filter to kdrsc: + // propagate_strides( + // transform_filter_strides, filter_dims, {1, 4, 3, 2, 0}); + + return propagate_strides( + transform_filter_strides, filter_dims, {1, 4, 3, 2, 0}); + case 6: + return propagate_strides(transform_filter_strides, filter_dims, + {1, 5, 4, 3, 2, 0}); + } + } + + void set_filter_format(int filter_ndims, int *filter_dims, + int *transform_filter_strides, cudnnTensorFormat_t format) { + if (format == CUDNN_TENSOR_NCHW) { + set_filter_nchw( + filter_ndims, transform_filter_strides, filter_dims); + } else { + set_filter_nhwc( + filter_ndims, transform_filter_strides, filter_dims); + } + } + void transform_filter(cudnnHandle_t handle, void *current_filter, + void *transform_filter) const { + CUDNN_EXECUTE_FUNC(cudnnTransformTensor, handle, &filter_alpha_, + current_filter_desc_, current_filter, &filter_beta_, + transform_filter_desc_, transform_filter); + } + void undo_transform_filter(cudnnHandle_t handle, void *transform_filter, + void *current_filter) const { + CUDNN_EXECUTE_FUNC(cudnnTransformTensor, handle, &filter_alpha_, + transform_filter_desc_, transform_filter, &filter_beta_, + current_filter_desc_, current_filter); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_conv_inner_product.hpp b/src/gpu/nvidia/cudnn_conv_inner_product.hpp new file mode 100644 index 00000000000..57fddb02f42 --- /dev/null +++ b/src/gpu/nvidia/cudnn_conv_inner_product.hpp @@ -0,0 +1,396 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_CONV_INNER_PRODUCT_HPP +#define GPU_NVIDIA_CUDNN_CONV_INNER_PRODUCT_HPP + +#include "cudnn.h" + +#include + +#include "common/c_types_map.hpp" +#include "common/inner_product_pd.hpp" +#include "common/primitive.hpp" +#include "gpu/nvidia/cudnn_conv_inner_product_impl.hpp" +#include "gpu/nvidia/cudnn_inner_product.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { +namespace { +inline status_t init_mem_by_tag(format_tag_t tag, memory_desc_t &md) { + if (tag == format_tag::undef) { return status::unimplemented; } + CHECK(memory_desc_init_by_tag(md, tag)); + return status::success; +} + +inline format_tag_t get_tag(const memory_desc_t &md) { + using namespace format_tag; + auto tag = memory_desc_matches_one_of_tag(md, ab, abc, abcd, + abcde, // NCHW derivatives + ba, bca, bcda, bcdea, cba, cdba, + cdeba, // IO and spatial derivatives + acb, acdb, acdeb, // NHWC derivatives + aBcd16b, aBcde16b, aBcd8b, aBcde8b, aBcd4b, + aBcde4b); // blocked layouts + return tag; +} +} // namespace + +struct cudnn_conv_inner_product_fwd_t : public cudnn_inner_product_fwd_t { + using cudnn_inner_product_fwd_t::cudnn_inner_product_fwd_t; + using parent_pd_t = cudnn_inner_product_fwd_t::pd_t; + struct pd_t : public parent_pd_t { + using parent_pd_t::parent_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:conv", cudnn_conv_inner_product_fwd_t); + + status_t init(engine_t *engine) { + using namespace data_type; + using namespace prop_kind; + const auto attr_skip_mask = primitive_attr_t::skip_mask_t::oscale + | primitive_attr_t::skip_mask_t::post_ops; + // Flag for checking if the fused routine can be used for the + // blocked format case. If set to true, that implies ReLU and + // blocking are used. 
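// For context: the checks in this init() admit forward inner products with at
// most a sum followed by an eltwise post-op and, for int8 sources, a single
// common output scale. A minimal sketch of requesting such a configuration
// through the public oneDNN API (shapes and names below are illustrative only
// and are not part of this patch; the calls follow the 2020-era dnnl.hpp
// interface):
#include <dnnl.hpp>

dnnl::inner_product_forward make_ip_with_relu(const dnnl::engine &eng) {
    using namespace dnnl;
    memory::desc src_md({32, 1024}, memory::data_type::f32, memory::format_tag::ab);
    memory::desc wei_md({4096, 1024}, memory::data_type::f32, memory::format_tag::ab);
    memory::desc bia_md({4096}, memory::data_type::f32, memory::format_tag::a);
    memory::desc dst_md({32, 4096}, memory::data_type::f32, memory::format_tag::ab);

    post_ops po;
    po.append_eltwise(1.f, algorithm::eltwise_relu, 0.f, 0.f); // fused ReLU
    primitive_attr attr;
    attr.set_post_ops(po);

    inner_product_forward::desc d(
            prop_kind::forward_inference, src_md, wei_md, bia_md, dst_md);
    inner_product_forward::primitive_desc pd(d, attr, eng);
    return inner_product_forward(pd);
}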
+ bool use_fused_path_for_blocking = false; + bool ok = true && set_default_params() == status::success; + ok = ok + && utils::one_of(desc()->prop_kind, forward_training, + forward_inference) + && data_types_ok() && memory_format_ok(src_md()) + && memory_format_ok(weights_md(0)) + && memory_format_ok(dst_md()) + && blocking_ok(with_eltwise(), use_fused_path_for_blocking) + && IMPLICATION(with_bias(), memory_format_ok(weights_md(1))) + && attr()->has_default_values(attr_skip_mask) + && post_ops_ok(attr()) + && IMPLICATION(!attr()->output_scales_.has_default_values(), + utils::one_of(src_md_.data_type, s8) + && attr()->output_scales_.mask_ == 0); + if (!ok) return status::unimplemented; + if (has_zero_dim_memory()) return status::success; + + inner_product_impl_.reset( + new cudnn_conv_inner_product_fwd_impl_t()); + + auto st = inner_product_impl_->init(engine, this, with_relu(), + with_eltwise(), with_sum(), use_fused_path_for_blocking); + return st; + } + bool post_ops_ok(const primitive_attr_t *attr) const { + const auto &p = attr->post_ops_; + + auto is_eltwise + = [&](int idx) { return p.entry_[idx].is_eltwise(false); }; + auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(false); }; + + switch (p.len()) { + case 0: return true; // no post_ops + case 1: return is_eltwise(0) || is_sum(0); // sum OR eltwise + case 2: return is_sum(0) && is_eltwise(1); // sum -> eltwise + default: return false; + } + + return false; + } + bool with_eltwise() const { + return attr()->post_ops_.find(primitive_kind::eltwise) != -1; + } + + bool with_relu() const { + auto idx = attr()->post_ops_.find(primitive_kind::eltwise); + if (idx != -1) { return attr()->post_ops_.entry_[idx].is_relu(); } + return false; + } + + bool with_sum() const { + return attr()->post_ops_.find(primitive_kind::sum) != -1; + } + + status_t set_default_params() { + using namespace format_tag; + + // Although cuDNN does support arbitrary striding in the src + // and dst tensors, it does not support filters in any format + // where the N dimension follows the C dimension. So transpose the + // filter here if that is that case, and the src along with it. + auto set_default = [&]() { + if (ndims() < 5 && src_md_.data_type == data_type::s8) { + CHECK(init_mem_by_tag( + utils::pick(ndims() - 2, ab, acb, acdb, acdeb), + src_md_)); + } else { + CHECK(init_mem_by_tag( + utils::pick(ndims() - 2, ab, abc, abcd, abcde), + src_md_)); + } + CHECK(init_mem_by_tag(get_tag(src_md_), weights_md_)); + + return status::success; + }; + + if ((src_md()->format_kind == format_kind::any) + && (weights_md(0)->format_kind == format_kind::any)) { + CHECK(set_default()); + } else if ((src_md()->format_kind == format_kind::any) + && (weights_md(0)->format_kind != format_kind::any)) { + CHECK(init_mem_by_tag(get_tag(weights_md_), src_md_)); + } else if ((src_md()->format_kind != format_kind::any) + && (weights_md(0)->format_kind == format_kind::any)) { + CHECK(init_mem_by_tag(get_tag(src_md_), weights_md_)); + } + + if (dst_md()->format_kind == format_kind::any) + CHECK(memory_desc_init_by_tag(dst_md_, nc)); + if (weights_md(1)->format_kind == format_kind::any) + CHECK(memory_desc_init_by_tag(bias_md_, x)); + return status::success; + } + + bool blocking_ok( + bool with_relu, bool &use_fused_path_for_blocking) const { + // Bias and dst should not be blocked. + if (weights_md(1)->format_desc.blocking.inner_nblks + + dst_md()->format_desc.blocking.inner_nblks + != 0) + return false; + // If the src and filter are not blocked, done. 
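// Standalone illustration of the layout test relied on below: the blocked
// path is only taken when the descriptor's single inner block is a block of 4
// over the channel dimension (index 1), which is effectively what
// memory_desc_matches_nchw_vect_c() asks; the real helper matches the full
// blocked tag (strides included), so this is a simplification.
#include <cstdint>

// inner_nblks, inner_blks and inner_idxs mirror the fields of oneDNN's
// blocking_desc_t.
inline bool is_channel_blocked_by_4(
        int inner_nblks, const int64_t *inner_blks, const int64_t *inner_idxs) {
    return inner_nblks == 1 && inner_blks[0] == 4 && inner_idxs[0] == 1;
}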
+ if (src_md()->format_desc.blocking.inner_nblks + + weights_md(0)->format_desc.blocking.inner_nblks + == 0) + return true; + + use_fused_path_for_blocking = with_relu; + // Otherwise check blocking is done on C dimension, that the block + // size is 4, that INT8 is used, that both srcs are blocked, and + // check whether ReLU is used (this enables the fast path). + return memory_desc_matches_nchw_vect_c(src_md()) + && memory_desc_matches_nchw_vect_c(weights_md(0)); + } + + bool data_types_ok() const { + using namespace data_type; + dnnl_data_type_t src_type = src_md()->data_type; + dnnl_data_type_t weights_type = weights_md(0)->data_type; + dnnl_data_type_t bias_type = weights_md(1)->data_type; + dnnl_data_type_t dst_type = dst_md()->data_type; + dnnl_data_type_t acc_type = desc()->accum_data_type; + + bool src_wei_match = src_type == weights_type; + + // If no bias used, there is no need to check it + auto bias_may_use_type = with_bias() ? bias_type : src_type; + bool bias_match = IMPLICATION(with_bias(), + bias_type == f32 + || utils::everyone_is(f16, src_type, weights_type, + bias_type, dst_type)); + + bool acc_match = src_wei_match && src_type == s8 + ? acc_type == s32 + : bias_match && bias_may_use_type == f16 ? acc_type == f16 + : acc_type == f32; + + switch (dst_type) { + case f32: + return src_wei_match && bias_match && acc_match + && src_type == f32; + case f16: + return bias_match && acc_match && bias_may_use_type == f16; + case s8: + return src_wei_match && acc_match && weights_type == s8; + } + return false; + } + }; + + const pd_t *pd() const override { + return (const pd_t *)primitive_t::pd().get(); + } +}; + +struct cudnn_conv_inner_product_bwd_data_t + : public cudnn_inner_product_bwd_data_t { + using cudnn_inner_product_bwd_data_t::cudnn_inner_product_bwd_data_t; + using parent_pd_t = cudnn_inner_product_bwd_data_t::pd_t; + struct pd_t : public parent_pd_t { + using parent_pd_t::parent_pd_t; + + DECLARE_COMMON_PD_T( + "cuda:cudnn:conv", cudnn_conv_inner_product_bwd_data_t); + + status_t init(engine_t *engine) { + using namespace data_type; + using namespace prop_kind; + + bool ok = true && set_default_params() == status::success; + ok = ok && desc()->prop_kind == backward_data && data_types_ok() + && no_blocking() && attr()->has_default_values() + && memory_format_ok(diff_src_md()) + && memory_format_ok(weights_md(0)) + && memory_format_ok(diff_dst_md()); + + if (!ok) return status::unimplemented; + if (has_zero_dim_memory()) return status::success; + + inner_product_impl_.reset( + new cudnn_conv_inner_product_bwd_data_impl_t()); + + return inner_product_impl_->init( + engine, this, false, false, false, false); + } + + status_t set_default_params() { + using namespace format_tag; + + auto set_default_diff_src = [&]() { + if (weights_md_.format_kind == format_kind::any) { + CHECK(init_mem_by_tag( + utils::pick(ndims() - 2, ab, abc, abcd, abcde), + diff_src_md_)); + } else { + CHECK(init_mem_by_tag(get_tag(weights_md_), diff_src_md_)); + } + return status::success; + }; + + auto set_default_weights = [&]() { + CHECK(init_mem_by_tag(get_tag(diff_src_md_), weights_md_)); + return status::success; + }; + + if (diff_src_md_.format_kind == format_kind::any) + CHECK(set_default_diff_src()); + if (weights_md_.format_kind == format_kind::any) + CHECK(set_default_weights()); + if (diff_dst_md_.format_kind == format_kind::any) + CHECK(memory_desc_init_by_tag(diff_dst_md_, nc)); + return status::success; + } + + bool no_blocking() const { + return 
diff_src_md()->format_desc.blocking.inner_nblks + + weights_md(0)->format_desc.blocking.inner_nblks + + diff_dst_md()->format_desc.blocking.inner_nblks + == 0; + } + + bool data_types_ok() const { + return utils::everyone_is(data_type::f32, diff_src_md()->data_type, + weights_md(0)->data_type, diff_dst_md()->data_type, + desc()->accum_data_type); + } + }; + + const pd_t *pd() const override { + return (const pd_t *)primitive_t::pd().get(); + } +}; + +struct cudnn_conv_inner_product_bwd_weights_t + : public cudnn_inner_product_bwd_weights_t { + using cudnn_inner_product_bwd_weights_t::cudnn_inner_product_bwd_weights_t; + using parent_pd_t = cudnn_inner_product_bwd_weights_t::pd_t; + struct pd_t : public parent_pd_t { + using parent_pd_t::parent_pd_t; + DECLARE_COMMON_PD_T( + "cuda:cudnn:conv", cudnn_conv_inner_product_bwd_weights_t); + + status_t init(engine_t *engine) { + using namespace data_type; + using namespace prop_kind; + bool ok = true && (set_default_params() == status::success); + ok = ok && (desc()->prop_kind == backward_weights) + && data_types_ok() && no_blocking() + && attr()->has_default_values() + && memory_format_ok(src_md()) + && memory_format_ok(diff_weights_md(0)) + && memory_format_ok(diff_dst_md()) + && IMPLICATION( + with_bias(), memory_format_ok(diff_weights_md(1))); + + if (!ok) return status::unimplemented; + if (has_zero_dim_memory()) return status::success; + + inner_product_impl_.reset( + new cudnn_conv_inner_product_bwd_weights_impl_t()); + + return inner_product_impl_->init( + engine, this, false, false, false, false); + } + + status_t set_default_params() { + using namespace format_tag; + + auto set_default_src = [&]() { + if (diff_weights_md_.format_kind == format_kind::any) { + CHECK(init_mem_by_tag( + utils::pick(ndims() - 2, ab, abc, abcd, abcde), + src_md_)); + } else { + CHECK(init_mem_by_tag(get_tag(diff_weights_md_), src_md_)); + } + return status::success; + }; + + auto set_default_diff_weights = [&]() { + CHECK(init_mem_by_tag(get_tag(src_md_), diff_weights_md_)); + return status::success; + }; + + if (src_md_.format_kind == format_kind::any) + CHECK(set_default_src()); + if (diff_weights_md_.format_kind == format_kind::any) + CHECK(set_default_diff_weights()); + if (diff_dst_md_.format_kind == format_kind::any) + CHECK(memory_desc_init_by_tag(diff_dst_md_, nc)); + if (diff_bias_md_.format_kind == format_kind::any) + CHECK(memory_desc_init_by_tag(diff_bias_md_, x)); + return status::success; + } + + bool no_blocking() const { + return src_md()->format_desc.blocking.inner_nblks + + diff_weights_md(0)->format_desc.blocking.inner_nblks + + diff_weights_md(1)->format_desc.blocking.inner_nblks + + diff_dst_md()->format_desc.blocking.inner_nblks + == 0; + } + + bool data_types_ok() const { + return IMPLICATION(with_bias(), + diff_weights_md(1)->data_type == data_type::f32) + && utils::everyone_is(data_type::f32, src_md()->data_type, + diff_weights_md(0)->data_type, + diff_dst_md()->data_type, desc()->accum_data_type); + } + }; + + const pd_t *pd() const override { + return (const pd_t *)primitive_t::pd().get(); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_conv_inner_product_impl.hpp b/src/gpu/nvidia/cudnn_conv_inner_product_impl.hpp new file mode 100644 index 00000000000..a65b2ab53a7 --- /dev/null +++ b/src/gpu/nvidia/cudnn_conv_inner_product_impl.hpp @@ -0,0 +1,701 @@ +/******************************************************************************* +* Copyright 
2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_CONV_INNER_PRODUCT_IMPL_HPP +#define GPU_NVIDIA_CUDNN_CONV_INNER_PRODUCT_IMPL_HPP + +#include "cublas_v2.h" +#include "cudnn.h" + +#include "common/type_helpers.hpp" +#include "gpu/nvidia/cudnn_conv_filter_adjustment_base.hpp" +#include "gpu/nvidia/cudnn_inner_product_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_conv_inner_product_impl_base_t + : public cudnn_inner_product_fwd_base_t, + public cudnn_conv_filter_adjustment_base_t { + + bool unfold_dimensions_ = false; + cudnnConvolutionDescriptor_t conv_desc_ = nullptr; + cudnnFilterDescriptor_t filter_desc_; + + status_t filter_tag( + const memory_desc_t &md, format_tag_t &weight_tag) const { + using namespace format_tag; + weight_tag = memory_desc_matches_one_of_tag(md, oidhw, odhwi, dhwio, + oihw, ohwi, hwio, oiw, owi, wio, aBcd4b, + any); // blocked layouts + if (weight_tag == undef) return status::unimplemented; + return status::success; + } + + status_t source_tag(const memory_desc_t &md, format_tag_t &src_tag) const { + using namespace format_tag; + src_tag = memory_desc_matches_one_of_tag( + md, ncdhw, ndhwc, nchw, nhwc, ncw, nwc, aBcd4b, any); + if (src_tag == undef) return status::unimplemented; + return status::success; + } + + virtual ~cudnn_conv_inner_product_impl_base_t() { + if (conv_desc_) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyConvolutionDescriptor, conv_desc_); + } + if (filter_desc_) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyFilterDescriptor, filter_desc_); + } + for (size_t i = 0; i < NUM_IO - 1; i++) { + if (tensor_descs_[i]) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorDescriptor, tensor_descs_[i]); + } + } + } + + void unfold_dims(io memory_index, int *folded_dims, int *folded_strides, + cudnnTensorFormat_t format, int ndims) { + folded_dims[0] = dims_[memory_index][0]; + folded_dims[1] = dims_[memory_index][1]; + for (int i = 2; i < ndims; i++) { + folded_dims[1] *= dims_[memory_index][i]; + folded_dims[i] = 1; + } + for (int i = 2; i < ndims; i++) { + folded_strides[i] + = (format == CUDNN_TENSOR_NHWC ? 
folded_dims[1] : 1); + } + + folded_strides[1] = 1; + folded_strides[0] = folded_dims[1]; + } + + virtual void execute(cudnnHandle_t handle, cublasHandle_t, + const std::vector &args) const = 0; +}; + +struct cudnn_conv_inner_product_fwd_impl_t + : public cudnn_conv_inner_product_impl_base_t { + bool use_fused_path_for_blocking_ = false; + bool input_is_blocked_ = false; + bool filter_is_blocked_ = false; + cudnnConvolutionFwdAlgo_t algo_; + cudnnActivationDescriptor_t act_desc_fuse_relu; + cudnnActivationDescriptor_t act_desc_no_relu_; + cudnnTensorFormat_t source_format_; + + ~cudnn_conv_inner_product_fwd_impl_t() { + if (with_bias_) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyActivationDescriptor, act_desc_fuse_relu); + } + if ((with_eltwise_ && !with_relu_) || (!with_bias_ && with_relu_)) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyActivationDescriptor, act_desc_no_relu_); + } + } + virtual status_t init(engine_t *engine, inner_product_pd_t *pd, + bool with_relu, bool with_eltwise, bool with_sum, + bool use_fuse_path_for_blocking) override { + with_bias_ = pd->with_bias(); + with_relu_ = with_relu; + with_eltwise_ = with_eltwise; + use_fused_path_for_blocking_ = use_fuse_path_for_blocking; + output_scales_ = pd->attr()->output_scales_.scales_[0]; + with_sum_ = with_sum; + scale_bias_ = (output_scales_ != 1) && with_bias_; + // scaling factor to add the previous destination value to the current + // computation + sum_scale_ = sum_scale(pd); + input_is_blocked_ + = pd->src_md()->format_desc.blocking.inner_blks[0] == 4; + filter_is_blocked_ + = pd->weights_md(0)->format_desc.blocking.inner_blks[0] == 4; + // Pad out the dimensions to at least 4. + if (pd->ndims() > CUDNN_DIM_MAX || pd->ndims() < 2) { + return status::invalid_arguments; + } + ndims_ = pd->ndims() < 4 ? 4 : pd->ndims(); + // Initialise meta-data from the descriptors. + // Convert the padded dimensions to the dimensions expected by cuDNN. + get_4d_tensor_descriptor( + pd->src_md(), dims_[io::src], strides_[io::src]); + get_4d_tensor_descriptor( + pd->weights_md(), dims_[io::wei], strides_[io::wei]); + get_4d_tensor_descriptor( + pd->dst_md(), dims_[io::dst], strides_[io::dst]); + + // Convert oneDNN data types to their cuDNN counterparts. + CHECK(convert_data_type(pd->src_md(), &data_types_[io::src])); + CHECK(convert_data_type(pd->weights_md(0), &data_types_[io::wei])); + if (input_is_blocked_) { + data_types_[io::dst] = CUDNN_DATA_INT8x4; + } else { + CHECK(convert_data_type(pd->dst_md(), &data_types_[io::dst])); + } + + // Ensure INT8 types are accumulated with INT32. + if (data_types_[io::src] != CUDNN_DATA_HALF + && data_types_[io::src] != CUDNN_DATA_FLOAT) { + data_types_[NUM_IO] = CUDNN_DATA_INT32; + } + + cudnnTensorFormat_t weights_format; + format_tag_t w_tag, s_tag; + CHECK(filter_tag(*pd->weights_md(0), w_tag)); + CHECK(source_tag(*pd->src_md(0), s_tag)); + CHECK(get_format( + pd->src_md(), source_format_, pd->src_md()->ndims == 2)); + + // Currently cuDNN does not support + // cudnnConvolutionBiasActivationForward + // for 5D convolution. Therefore we have to unfold the dims for 5d when + // it is 5d. Also cuDNN does not support s8 type and nhwc format for + // 5d convolution. 
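// Standalone sketch of the folding that unfold_dims() performs: every
// dimension past the batch is collapsed into the channel dimension and the
// trailing spatial dimensions become 1, so a 5d NCDHW tensor can be handed to
// 4d-only cuDNN routines as N x (C*D*H*W) x 1 x 1.
#include <cstdio>

static void fold_to_4d(const int in_dims[5], int out_dims[4]) {
    out_dims[0] = in_dims[0]; // N is kept as-is
    out_dims[1] = in_dims[1] * in_dims[2] * in_dims[3] * in_dims[4]; // C*D*H*W
    out_dims[2] = 1; // unit "H"
    out_dims[3] = 1; // unit "W"
}

int main() {
    const int ncdhw[5] = {2, 16, 3, 4, 5};
    int folded[4];
    fold_to_4d(ncdhw, folded);
    std::printf("%d x %d x %d x %d\n", folded[0], folded[1], folded[2], folded[3]);
    return 0; // prints 2 x 960 x 1 x 1
}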
+ unfold_dimensions_ = ndims_ > 4 + && ((pd->weights_md(0)->data_type == data_type::s8) + || (source_format_ == CUDNN_TENSOR_NHWC) || with_bias_); + + if (!supported_filter_format(pd->weights_md(0)) + || (unfold_dimensions_ && (w_tag != s_tag)) + || ((source_format_ == CUDNN_TENSOR_NCHW) + && (w_tag != s_tag))) { + set_filter_format( + ndims_, dims_[io::wei], strides_[NUM_IO], source_format_); + CHECK(init_filter_transformation(data_types_[io::wei], ndims_, + dims_[io::wei], strides_[io::wei], strides_[NUM_IO])); + filter_using_spatial_format_ = true; + // we transform the filter based on src format + weights_format = source_format_; + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_none, + memory_desc_wrapper(pd->weights_md(0)).size(), size_t(1)); + } else { + CHECK(get_format(pd->weights_md(0), weights_format, + pd->weights_md(0)->ndims == 2)); + } + + if (scale_bias_) { + + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_conv_adjusted_scales, + memory_desc_wrapper(pd->weights_md(1)).size(), size_t(1)); + } + + // Copy over the strides. + if (with_bias_) { + CHECK(convert_data_type(pd->weights_md(1), &data_types_[io::bia])); + set_bias_dims(weights_format, ndims_, pd->OC()); + } + + // cuDNN requires Input and output feature maps to be a multiple of 4 + // for int8. only nhwc is supported for int8// cudnn doesnot support + // 5d convolution format for int8 + if ((pd->weights_md(0)->data_type == data_type::s8) + && ((pd->IC() % 4 != 0) || (pd->OC() % 4 != 0))) { + return status::unimplemented; + } + // source format and weight format are the same at this stage + if (unfold_dimensions_) { + unfold_dims(io::wei, dims_[io::wei], strides_[io::wei], + source_format_, ndims_); + unfold_dims(io::src, dims_[io::src], strides_[io::src], + source_format_, ndims_); + ndims_ = 4; + } + + if (input_is_blocked_) { + CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[io::src], + CUDNN_TENSOR_NCHW_VECT_C, data_types_[io::src], ndims_, + dims_[io::src])); + } else { + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::src], + data_types_[io::src], ndims_, dims_[io::src], + strides_[io::src])); + } + if (with_bias_) { + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::bia], + data_types_[io::bia], ndims_, dims_[io::bia], + strides_[io::bia])); + } + // If input is blocked, the output needs to be as well. + if (input_is_blocked_) { + CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[io::dst], + CUDNN_TENSOR_NCHW_VECT_C, data_types_[io::dst], ndims_, + dims_[io::dst])); + } else { + cudnnTensorFormat_t out_format + = filter_is_blocked_ ? CUDNN_TENSOR_NCHW : weights_format; + CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[io::dst], + out_format, data_types_[io::dst], ndims_, dims_[io::dst])); + } + + CHECK(create_and_set_filter_descriptor(&filter_desc_, weights_format, + data_types_[io::wei], ndims_, dims_[io::wei], + strides_[io::wei])); + + // Set the convolution. For inner product, this means unit strides and + // dilation, no padding, and with cross-correlation as the mode. 
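// Minimal sketch of the convolution descriptor configured below, written
// directly against cuDNN (create_and_set_conv_descriptor() is assumed here to
// wrap cudnnSetConvolutionNdDescriptor): unit strides and dilation with zero
// padding, which is how an inner product is expressed as a convolution whose
// filter covers the whole spatial extent of the input.
#include <vector>
#include <cudnn.h>

cudnnStatus_t make_ip_conv_desc(cudnnConvolutionDescriptor_t *conv_desc,
        int conv_dims /* ndims - 2 */, cudnnDataType_t compute_type) {
    cudnnStatus_t st = cudnnCreateConvolutionDescriptor(conv_desc);
    if (st != CUDNN_STATUS_SUCCESS) return st;
    std::vector<int> pad(conv_dims, 0), stride(conv_dims, 1), dilation(conv_dims, 1);
    return cudnnSetConvolutionNdDescriptor(*conv_desc, conv_dims, pad.data(),
            stride.data(), dilation.data(), CUDNN_CROSS_CORRELATION,
            compute_type);
}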
+ int conv_dims = ndims_ - 2; + std::vector unit_strides(conv_dims, 1); + std::vector unit_dilation(conv_dims, 1); + std::vector zero_padding(conv_dims, 0); + + CHECK(create_and_set_conv_descriptor(&conv_desc_, conv_dims, + zero_padding.data(), unit_strides.data(), unit_dilation.data(), + CUDNN_CROSS_CORRELATION, data_types_[NUM_IO])); + + auto &sycl_engine = *utils::downcast(engine); + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + + auto cuda_stream + = utils::downcast(service_stream); + auto handle = cuda_stream->get_cudnn_handle(); + + // Inner product can choose whatever algorithm it prefers, although + // for the identity post-op the IMPLICIT_PRECOMP_GEMM must be used. + // there is a bug in nvidia that cannot support + // cudnnGetConvolutionForwardAlgorithm for int8 type + if (pd->src_md()->data_type != data_type::s8 + && pd->weights_md(0)->data_type != data_type::s8) { + cudnnConvolutionFwdPreference_t algo_pref + = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardAlgorithm, + handle, tensor_descs_[io::src], filter_desc_, conv_desc_, + tensor_descs_[io::dst], algo_pref, 0, &algo_)); + } else { + algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + } + if (!with_relu_) { + algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + } + + // Allocate the workspace from the algorithm selection, if applicable. + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardWorkspaceSize, + handle, tensor_descs_[io::src], filter_desc_, conv_desc_, + tensor_descs_[io::dst], algo_, &workspace_size_)); + if (workspace_size_ > 0) { + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_iprod_int_dat_in_acc_dt, + workspace_size_, size_t(1)); + } + + // Add the eltwise op. Note that this only applies to the forward pass. + CHECK(create_and_set_op_descriptor(pd)); + return status::success; + } + + void execute(cudnnHandle_t handle, cublasHandle_t, + const std::vector &args) const override { + auto x = args[0], w = args[1], b = args[2], y = args[3], + workspace = args[4]; + assert(args.size() == 7); + auto w_arg = w; + if (filter_using_spatial_format_) { + void *transformed_w = args[5]; + transform_filter(handle, w, transformed_w); + w_arg = transformed_w; + } + + if (with_bias_) { + auto scaled_bias = b; + if (scale_bias_) { + void *output_scale_workspace = args[6]; + CUDNN_EXECUTE_FUNC(cudnnAddTensor, handle, &output_scales_, + tensor_descs_[io::bia], b, &beta_, + tensor_descs_[io::bia], output_scale_workspace); + scaled_bias = output_scale_workspace; + } + + CUDNN_EXECUTE_FUNC(cudnnConvolutionBiasActivationForward, handle, + &output_scales_, tensor_descs_[io::src], x, filter_desc_, + w_arg, conv_desc_, algo_, workspace, workspace_size_, + &sum_scale_, tensor_descs_[io::dst], y, + tensor_descs_[io::bia], scaled_bias, act_desc_fuse_relu, + tensor_descs_[io::dst], y); + } else { + CUDNN_EXECUTE_FUNC(cudnnConvolutionForward, handle, &output_scales_, + tensor_descs_[io::src], x, filter_desc_, w_arg, conv_desc_, + algo_, workspace, workspace_size_, &sum_scale_, + tensor_descs_[io::dst], y); + } + if ((with_eltwise_ && !with_relu_) || (!with_bias_ && with_relu_)) { + CUDNN_EXECUTE_FUNC(cudnnActivationForward, handle, + act_desc_no_relu_, &alpha_, tensor_descs_[io::dst], y, + &beta_, tensor_descs_[io::dst], y); + } + } + +private: + status_t create_and_set_op_descriptor(inner_product_pd_t *pd) { + if (with_bias_) { + auto mode_fuse = with_relu_ ? 
CUDNN_ACTIVATION_RELU + : CUDNN_ACTIVATION_IDENTITY; + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnCreateActivationDescriptor, &act_desc_fuse_relu)); + // For ReLU, a ceiling of 0 means no limit. + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, + act_desc_fuse_relu, mode_fuse, + cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN, + eltwise_alpha(pd))); + } + if ((with_eltwise_ && !with_relu_) || (!with_bias_ && with_relu_)) { + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnCreateActivationDescriptor, &act_desc_no_relu_)); + + cudnnActivationMode_t no_relu_mode; + switch (eltwise_algorithm_kind(pd)) { + case alg_kind::eltwise_tanh: + no_relu_mode = CUDNN_ACTIVATION_TANH; + break; + case alg_kind::eltwise_elu: + no_relu_mode = CUDNN_ACTIVATION_ELU; + break; + case alg_kind::eltwise_relu: + no_relu_mode = CUDNN_ACTIVATION_RELU; + break; + case alg_kind::eltwise_logistic: + no_relu_mode = CUDNN_ACTIVATION_SIGMOID; + break; + case alg_kind::eltwise_bounded_relu: + no_relu_mode = CUDNN_ACTIVATION_CLIPPED_RELU; + break; + default: return status::unimplemented; + } + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, + act_desc_no_relu_, no_relu_mode, + cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN, + eltwise_alpha(pd))); + } + return status::success; + } +}; + +struct cudnn_conv_inner_product_bwd_data_impl_t + : public cudnn_conv_inner_product_impl_base_t { + cudnnConvolutionBwdDataAlgo_t algo_; + // the type of filter depends on dy, however since dy is nc + // for nhwc filter the source must be nhwc as well. + // So we use the src type for transforming the filter. + cudnnTensorFormat_t diff_source_format_; + virtual status_t init(engine_t *engine, inner_product_pd_t *pd, + bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */, + bool /*using_fused_path_for_blocking*/) override { + // Pad out the dimensions to 4 + if (pd->ndims() > CUDNN_DIM_MAX || pd->ndims() < 2) { + return status::invalid_arguments; + } + ndims_ = pd->ndims() < 4 ? 4 : pd->ndims(); + // Initialise meta-data from the descriptors. + // Convert the padded dimensions to the dimensions expected by cuDNN. + get_4d_tensor_descriptor( + pd->diff_src_md(), dims_[io::src], strides_[io::src]); + get_4d_tensor_descriptor( + pd->weights_md(), dims_[io::wei], strides_[io::wei]); + get_4d_tensor_descriptor( + pd->diff_dst_md(), dims_[io::dst], strides_[io::dst]); + + // Convert oneDNN data types to their cuDNN counterparts. + CHECK(convert_data_type(pd->diff_src_md(), &data_types_[io::src])); + CHECK(convert_data_type(pd->weights_md(0), &data_types_[io::wei])); + CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[io::dst])); + + format_tag_t w_tag, s_tag; + CHECK(filter_tag(*pd->weights_md(0), w_tag)); + CHECK(source_tag(*pd->diff_src_md(0), s_tag)); + cudnnTensorFormat_t weights_format; + CHECK(get_format(pd->diff_src_md(), diff_source_format_)); + // Currently nvidia does not support cudnnConvolution + // for 5D convolution when the filter format is nhwc. + // Therefore we have to unfold the dims for 5d when it is 5d. + unfold_dimensions_ + = ndims_ > 4 && ((diff_source_format_ == CUDNN_TENSOR_NHWC)); + // Copy over the strides. + // weight format and dy format must be the same, since dx is the result + // here, we check with diff_src, to make sure we get the correct result. 
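// Minimal standalone sketch of the filter re-layout performed when the
// incoming filter format is not one cuDNN accepts (see
// init_filter_transformation()/transform_filter() in
// cudnn_conv_filter_adjustment_base.hpp): the same dimensions are described
// with two different stride orders and cudnnTransformTensor copies the data
// from one layout to the other. Error handling is trimmed for brevity.
#include <cudnn.h>

cudnnStatus_t relayout_filter(cudnnHandle_t handle, cudnnDataType_t dt,
        int ndims, const int *dims, const int *src_strides,
        const int *dst_strides, const void *src, void *dst) {
    cudnnTensorDescriptor_t src_desc, dst_desc;
    cudnnCreateTensorDescriptor(&src_desc);
    cudnnCreateTensorDescriptor(&dst_desc);
    cudnnSetTensorNdDescriptor(src_desc, dt, ndims, dims, src_strides);
    cudnnSetTensorNdDescriptor(dst_desc, dt, ndims, dims, dst_strides);
    const float alpha = 1.f, beta = 0.f;
    cudnnStatus_t st = cudnnTransformTensor(
            handle, &alpha, src_desc, src, &beta, dst_desc, dst);
    cudnnDestroyTensorDescriptor(src_desc);
    cudnnDestroyTensorDescriptor(dst_desc);
    return st;
}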
+ if (!supported_filter_format(pd->weights_md(0)) || (w_tag != s_tag)) { + set_filter_format(ndims_, dims_[io::wei], strides_[NUM_IO], + diff_source_format_); + CHECK(init_filter_transformation(data_types_[io::wei], ndims_, + dims_[io::wei], strides_[io::wei], strides_[NUM_IO])); + filter_using_spatial_format_ = true; + // the type of weight format must match + weights_format = diff_source_format_; + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_none, + memory_desc_wrapper(pd->weights_md(0)).size(), size_t(1)); + } else { + CHECK(get_format(pd->weights_md(0), weights_format)); + } + + // source format and weight format are the same at this stage + if (unfold_dimensions_) { + unfold_dims(io::wei, dims_[io::wei], strides_[io::wei], + diff_source_format_, ndims_); + unfold_dims(io::src, dims_[io::src], strides_[io::src], + diff_source_format_, ndims_); + ndims_ = 4; + } + + // Set the tensor descriptors from the dimensions and strides. + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::src], + data_types_[io::src], ndims_, dims_[io::src], + strides_[io::src])); + + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::dst], + data_types_[io::dst], ndims_, dims_[io::dst], + strides_[io::dst])); + + CHECK(create_and_set_filter_descriptor(&filter_desc_, weights_format, + data_types_[io::wei], ndims_, dims_[io::wei], + strides_[io::wei])); + + // Set the convolution. For inner product, this means unit strides and + // dilation, no padding, and with cross-correlation as the mode. + int conv_dims = ndims_ - 2; + std::vector unit_strides(conv_dims, 1); + std::vector unit_dilation(conv_dims, 1); + std::vector zero_padding(conv_dims, 0); + + CHECK(create_and_set_conv_descriptor(&conv_desc_, conv_dims, + zero_padding.data(), unit_strides.data(), unit_dilation.data(), + CUDNN_CROSS_CORRELATION, data_types_[NUM_IO])); + auto &sycl_engine = *utils::downcast(engine); + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + + auto cuda_stream + = utils::downcast(service_stream); + auto handle = cuda_stream->get_cudnn_handle(); + + // Inner product can choose whatever algorithm it prefers. + cudnnConvolutionBwdDataPreference_t algo_pref + = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST; + + CUDNN_EXECUTE_FUNC(cudnnGetConvolutionBackwardDataAlgorithm, handle, + filter_desc_, tensor_descs_[io::dst], conv_desc_, + tensor_descs_[io::src], algo_pref, 0, &algo_); + + // Allocate the workspace from the algorithm selection, if applicable. 
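// Note on the algorithm query used here (and in the forward and
// backward-weights passes): the cudnnGetConvolution*Algorithm entry points
// with a *_PREFER_FASTEST preference are the pre-cuDNN-8 selection API; with
// that preference the memory-limit argument (0 above) is not used as a cap,
// so the chosen algorithm may still require a workspace. That is why the
// workspace size is queried next and booked in the scratchpad registry
// whenever it is non-zero.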
+ CUDNN_EXECUTE_FUNC(cudnnGetConvolutionBackwardDataWorkspaceSize, handle, + filter_desc_, tensor_descs_[io::dst], conv_desc_, + tensor_descs_[io::src], algo_, &workspace_size_); + + if (workspace_size_ > 0) { + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_iprod_int_dat_in_acc_dt, + workspace_size_, size_t(1)); + } + + return status::success; + } + + void execute(cudnnHandle_t handle, cublasHandle_t, + const std::vector &args) const override { + assert(args.size() == 5); + auto dx = args[0], w = args[1], dy = args[2], workspace = args[3]; + auto w_arg = w; + if (filter_using_spatial_format_) { + auto transformed_w = args[4]; + transform_filter(handle, w, transformed_w); + w_arg = transformed_w; + } + CUDNN_EXECUTE_FUNC(cudnnConvolutionBackwardData, handle, &alpha_, + filter_desc_, w_arg, tensor_descs_[io::dst], dy, conv_desc_, + algo_, workspace, workspace_size_, &beta_, + tensor_descs_[io::src], dx); + } +}; + +struct cudnn_conv_inner_product_bwd_weights_impl_t + : public cudnn_conv_inner_product_impl_base_t { + cudnnConvolutionBwdFilterAlgo_t algo_; + cudnnTensorFormat_t source_format_; + + virtual status_t init(engine_t *engine, inner_product_pd_t *pd, + bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */, + bool /*using_fused_path_for_blocking*/) override { + // If any of the dimensions are 0 we should not continue with creating + // cudnn descriptors + with_bias_ = pd->with_bias(); + + // Pad out the dimensions to 4 + if (pd->ndims() > CUDNN_DIM_MAX || pd->ndims() < 2) { + return status::invalid_arguments; + } + ndims_ = pd->ndims() < 4 ? 4 : pd->ndims(); + + // Initialise meta-data from the descriptors. + // Convert the padded dimensions to the dimensions expected by cuDNN. + get_4d_tensor_descriptor( + pd->src_md(), dims_[io::src], strides_[io::src]); + get_4d_tensor_descriptor( + pd->diff_weights_md(), dims_[io::wei], strides_[io::wei]); + get_4d_tensor_descriptor( + pd->diff_dst_md(), dims_[io::dst], strides_[io::dst]); + + format_tag_t w_tag, s_tag; + CHECK(filter_tag(*pd->diff_weights_md(0), w_tag)); + CHECK(source_tag(*pd->src_md(0), s_tag)); + + cudnnTensorFormat_t diff_weights_format; + CHECK(get_format(pd->src_md(0), source_format_)); + // Currently nvidia does not support cudnnConvolution + // for 5D convolution when the filter format is nhwc. + // Therefore we have to unfold the dims for 5d when it is 5d. + unfold_dimensions_ + = ndims_ > 4 && ((source_format_ == CUDNN_TENSOR_NHWC)); + // weight format and src format must be the same. + // we check with src, to make sure we get the correct result. + if (!supported_filter_format(pd->diff_weights_md(0)) + || (w_tag != s_tag)) { + set_filter_format( + ndims_, dims_[io::wei], strides_[NUM_IO], source_format_); + CHECK(init_filter_transformation(data_types_[io::wei], ndims_, + dims_[io::wei], strides_[NUM_IO], strides_[io::wei])); + filter_using_spatial_format_ = true; + // the type of weight format must match + diff_weights_format = source_format_; + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_none, + memory_desc_wrapper(pd->diff_weights_md(0)).size(), + size_t(1)); + } else { + CHECK(get_format(pd->diff_weights_md(0), diff_weights_format)); + } + + // Copy over the strides. + // Convert oneDNN data types to their cuDNN counterparts. 
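// All three classes in this file reduce a fully connected layer to a cuDNN
// convolution with 1x1 spatial extent: unit strides, unit dilation and zero
// padding, as set up by create_and_set_conv_descriptor() above. A
// hypothetical standalone illustration of that descriptor setup for the 2-D
// (NCHW) case, with error handling omitted (not code from this patch):
//
//     // src (N, IC) is viewed as (N, IC, 1, 1) and wei (OC, IC) as
//     // (OC, IC, 1, 1), so the convolution yields dst (N, OC, 1, 1),
//     // i.e. the inner product result.
//     int conv_ndims = 2; // 4 tensor dims minus 2 non-spatial dims
//     std::vector<int> pad(conv_ndims, 0), stride(conv_ndims, 1),
//             dilation(conv_ndims, 1);
//     cudnnConvolutionDescriptor_t conv_desc;
//     cudnnCreateConvolutionDescriptor(&conv_desc);
//     cudnnSetConvolutionNdDescriptor(conv_desc, conv_ndims, pad.data(),
//             stride.data(), dilation.data(), CUDNN_CROSS_CORRELATION,
//             CUDNN_DATA_FLOAT);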
+ CHECK(convert_data_type(pd->src_md(), &data_types_[io::src])); + CHECK(convert_data_type(pd->diff_weights_md(0), &data_types_[io::wei])); + CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[io::dst])); + + // source format and weight format are the same at this stage + if (unfold_dimensions_) { + unfold_dims(io::wei, dims_[io::wei], strides_[io::wei], + source_format_, ndims_); + unfold_dims(io::src, dims_[io::src], strides_[io::src], + source_format_, ndims_); + ndims_ = 4; + } + + if (with_bias_) { + set_bias_dims(diff_weights_format, ndims_, pd->OC()); + CHECK(convert_data_type( + pd->diff_weights_md(1), &data_types_[io::bia])); + } + // Set the tensor descriptors from the dimensions and strides. + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::src], + data_types_[io::src], ndims_, dims_[io::src], + strides_[io::src])); + + CHECK(create_and_set_filter_descriptor(&filter_desc_, + diff_weights_format, data_types_[io::wei], ndims_, + dims_[io::wei], strides_[io::wei])); + + // oneDNN does not set unused dimensions and strides in the output, so + // we do that here. If nhwc filter, then repeat the N stride for the + // spatial dimensions. + + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::dst], + data_types_[io::dst], ndims_, dims_[io::dst], + strides_[io::dst])); + if (with_bias_) { + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::bia], + data_types_[io::bia], ndims_, dims_[io::bia], + strides_[io::bia])); + } + // Set the convolution. For inner product, this means unit strides and + // dilation, no padding, and with cross-correlation as the mode. + int conv_dims = ndims_ - 2; + std::vector unit_strides(conv_dims, 1); + std::vector unit_dilation(conv_dims, 1); + std::vector zero_padding(conv_dims, 0); + + CHECK(create_and_set_conv_descriptor(&conv_desc_, conv_dims, + zero_padding.data(), unit_strides.data(), unit_dilation.data(), + CUDNN_CROSS_CORRELATION, data_types_[NUM_IO])); + auto &sycl_engine = *utils::downcast(engine); + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + + auto cuda_stream + = utils::downcast(service_stream); + auto handle = cuda_stream->get_cudnn_handle(); + + // Inner product can choose whatever algorithm it prefers. + cudnnConvolutionBwdFilterPreference_t algo_pref + = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST; + + CUDNN_EXECUTE_FUNC(cudnnGetConvolutionBackwardFilterAlgorithm, handle, + tensor_descs_[io::src], tensor_descs_[io::dst], conv_desc_, + filter_desc_, algo_pref, 0, &algo_); + + // Allocate the workspace from the algorithm selection, if applicable. + CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionBackwardFilterWorkspaceSize, + handle, tensor_descs_[io::src], tensor_descs_[io::dst], + conv_desc_, filter_desc_, algo_, &workspace_size_); + if (workspace_size_ > 0) { + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_iprod_int_dat_in_acc_dt, + workspace_size_, size_t(1)); + } + + return status::success; + } + + void execute(cudnnHandle_t handle, cublasHandle_t, + const std::vector &args) const override { + assert(args.size() == 6); + auto x = args[0], dy = args[1], dw = args[2], db = args[3], + workspace = args[4]; + + auto dw_arg = filter_using_spatial_format_ ? 
args[5] : dw; + CUDNN_EXECUTE_FUNC(cudnnConvolutionBackwardFilter, handle, &alpha_, + tensor_descs_[io::src], x, tensor_descs_[io::dst], dy, + conv_desc_, algo_, workspace, workspace_size_, &beta_, + filter_desc_, dw_arg); + + if (filter_using_spatial_format_) { + // The output of weight is in nvida specific format, + // however a user requires the oneDNN format as an output + transform_filter(handle, dw_arg, dw); + } + + if (with_bias_) { + CUDNN_EXECUTE_FUNC(cudnnConvolutionBackwardBias, handle, &alpha_, + tensor_descs_[io::dst], dy, &beta_, tensor_descs_[io::bia], + db); + } + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_convolution.cpp b/src/gpu/nvidia/cudnn_convolution.cpp new file mode 100644 index 00000000000..a3a84b443a9 --- /dev/null +++ b/src/gpu/nvidia/cudnn_convolution.cpp @@ -0,0 +1,256 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/nvidia/cudnn_convolution.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_convolution_fwd_t::execute_convolution( + const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + using scratch_acc_t = cl::sycl::accessor; + auto x_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto weights_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + auto y_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + std::shared_ptr< + cl::sycl::accessor> + bias_acc; + std::shared_ptr scratch_acc; + std::shared_ptr filter_scratch_acc; + std::shared_ptr temp_dst_acc; + std::shared_ptr temp_reorder_acc; + if (with_scratchpad) { + scratch_acc = std::make_shared( + utils::downcast( + ctx.get_scratchpad_grantor() + .get_memory_storage(memory_tracking::names:: + key_conv_cudnn_algo) + .get()) + ->buffer() + .get_access( + cgh)); + } + if (with_bias) { + bias_acc = std::make_shared>( + CTX_IN_ACCESSOR(DNNL_ARG_BIAS)); + } + if (pd()->impl_->using_transformed_filter()) { + filter_scratch_acc + = std::make_shared(CTX_SCRATCH_ACCESSOR( + memory_tracking::names::key_conv_cudnn_filter)); + } + + if (pd()->use_temp_dst_) { + temp_dst_acc = std::make_shared( + buffer(scratch_storage.get()) + .get_access( + cgh)); + temp_reorder_acc = std::make_shared( + buffer(scratch_storage_2.get()) + .get_access( + cgh)); + } + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = 
cuda_stream->get_cudnn_handle(); + + std::vector args; + args.push_back(sc.memory(ih, x_acc)); + args.push_back(sc.memory(ih, weights_acc)); + args.push_back(sc.memory(ih, y_acc)); + args.push_back( + with_bias ? sc.memory(ih, *bias_acc) : nullptr); + args.push_back(with_scratchpad ? sc.memory(ih, *scratch_acc) + : nullptr); + args.push_back(pd()->impl_->using_transformed_filter() + ? sc.memory(ih, *filter_scratch_acc) + : nullptr); + args.push_back(pd()->use_temp_dst_ + ? sc.memory(ih, *temp_dst_acc) + : nullptr); + args.push_back(pd()->use_temp_dst_ + ? sc.memory(ih, *temp_reorder_acc) + : nullptr); + pd()->impl_->execute(handle, args); + }); + }); +} + +status_t cudnn_convolution_bwd_data_t::execute_convolution( + const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + using scratch_acc_t = cl::sycl::accessor; + auto x_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC); + auto weights_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + auto y_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + std::shared_ptr< + cl::sycl::accessor> + bias_acc; + std::shared_ptr scratch_acc; + std::shared_ptr filter_scratch_acc; + if (with_scratchpad) { + scratch_acc = std::make_shared( + utils::downcast( + ctx.get_scratchpad_grantor() + .get_memory_storage(memory_tracking::names:: + key_conv_cudnn_algo) + .get()) + ->buffer() + .get_access( + cgh)); + } + if (with_bias) { + bias_acc = std::make_shared>( + CTX_IN_ACCESSOR(DNNL_ARG_BIAS)); + } + if (pd()->impl_->using_transformed_filter()) { + filter_scratch_acc + = std::make_shared(CTX_SCRATCH_ACCESSOR( + memory_tracking::names::key_conv_cudnn_filter)); + } + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + std::vector args; + args.push_back(sc.memory(ih, x_acc)); + args.push_back(sc.memory(ih, weights_acc)); + args.push_back(sc.memory(ih, y_acc)); + args.push_back( + with_bias ? sc.memory(ih, *bias_acc) : nullptr); + args.push_back(with_scratchpad ? sc.memory(ih, *scratch_acc) + : nullptr); + args.push_back(pd()->impl_->using_transformed_filter() + ? 
sc.memory(ih, *filter_scratch_acc) + : nullptr); + pd()->impl_->execute(handle, args); + }); + }); +} +status_t cudnn_convolution_bwd_weights_t::execute_zero_dims( + const exec_ctx_t &ctx) const { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto weights_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS); + std::shared_ptr< + cl::sycl::accessor> + bias_acc; + if (pd()->with_bias()) { + bias_acc = std::make_shared>( + CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS)); + } + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + auto weights = sc.memory(ih, weights_acc); + void *bias = nullptr; + if (pd()->with_bias()) bias = sc.memory(ih, *bias_acc); + pd()->impl_->execute_set_weights_bias(handle, weights, bias, 0.f); + }); + }); +} +status_t cudnn_convolution_bwd_weights_t::execute_convolution( + const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const { + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + using scratch_acc_t = cl::sycl::accessor; + auto x_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto weights_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS); + auto y_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + std::shared_ptr< + cl::sycl::accessor> + bias_acc; + std::shared_ptr scratch_acc; + std::shared_ptr filter_scratch_acc; + if (with_scratchpad) { + scratch_acc = std::make_shared( + utils::downcast( + ctx.get_scratchpad_grantor() + .get_memory_storage(memory_tracking::names:: + key_conv_cudnn_algo) + .get()) + ->buffer() + .get_access( + cgh)); + } + if (with_bias) { + bias_acc = std::make_shared>( + CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS)); + } + if (pd()->impl_->using_transformed_filter()) { + filter_scratch_acc + = std::make_shared(CTX_SCRATCH_ACCESSOR( + memory_tracking::names::key_conv_cudnn_filter)); + } + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + std::vector args; + args.push_back(sc.memory(ih, x_acc)); + args.push_back(sc.memory(ih, weights_acc)); + args.push_back(sc.memory(ih, y_acc)); + args.push_back( + with_bias ? sc.memory(ih, *bias_acc) : nullptr); + args.push_back(with_scratchpad ? sc.memory(ih, *scratch_acc) + : nullptr); + args.push_back(pd()->impl_->using_transformed_filter() + ? sc.memory(ih, *filter_scratch_acc) + : nullptr); + pd()->impl_->execute(handle, args); + }); + }); +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_convolution.hpp b/src/gpu/nvidia/cudnn_convolution.hpp new file mode 100644 index 00000000000..455229e6b79 --- /dev/null +++ b/src/gpu/nvidia/cudnn_convolution.hpp @@ -0,0 +1,333 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_CONVOLUTION_HPP +#define GPU_NVIDIA_CUDNN_CONVOLUTION_HPP + +#include "cudnn.h" + +#include "common/c_types_map.hpp" +#include "common/primitive.hpp" +#include "common/primitive_desc.hpp" +#include "gpu/nvidia/cudnn_convolution_impl.hpp" +#include "gpu/nvidia/cudnn_convolution_pd.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_convolution_fwd_t : public primitive_t { + + struct pd_t : public cudnn_convolution_fwd_pd_t { + using cudnn_convolution_fwd_pd_t::cudnn_convolution_fwd_pd_t; + pd_t(const pd_t &other) + : cudnn_convolution_fwd_pd_t(other) + , impl_(other.impl_) + , use_temp_dst_(other.use_temp_dst_) + , dst_md_temp_(other.dst_md_temp_) {} + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_convolution_fwd_t); + + status_t init(engine_t *engine) { + using namespace data_type; + + const auto attr_skip_mask = primitive_attr_t::skip_mask_t::oscale + | primitive_attr_t::skip_mask_t::post_ops; + + bool ok = utils::one_of(desc()->prop_kind, + prop_kind::forward_training, prop_kind::forward_inference); + ok = ok && attr()->has_default_values(attr_skip_mask); + ok = ok && post_ops_ok(attr()); + ok = ok + && (utils::everyone_is(f32, src_md_.data_type, + weights_md_.data_type, dst_md_.data_type) + || utils::everyone_is(f16, src_md_.data_type, + weights_md_.data_type, dst_md_.data_type) + || (utils::everyone_is(s8, src_md_.data_type, + weights_md_.data_type) + && utils::one_of( + dst_md_.data_type, f32, s8))); + ok = ok && this->set_default_formats(); + ok = ok + && IMPLICATION( + desc()->alg_kind == dnnl_convolution_winograd, + ndims() < 5 && src_md_.data_type != s8); + ok = ok + && IMPLICATION(!attr()->output_scales_.has_default_values(), + src_md_.data_type == s8 + && attr()->output_scales_.mask_ == 0); + ok = ok + && IMPLICATION( + src_md_.data_type == s8, check_s8_configuration()); + ok = ok && memory_format_ok(&src_md_); + ok = ok && memory_format_ok(&weights_md_); + ok = ok && memory_format_ok(&dst_md_); + if (with_bias()) ok = ok && memory_format_ok(&bias_md_); + if (!ok) return status::unimplemented; + + if (check_for_zero_dims()) return status::success; + + if (use_temp_dst_) { + dst_md_temp_ = dst_md_; + if (dst_md_.data_type == s8) { dst_md_temp_.data_type = f32; } + } + + impl_.reset(new cudnn_convolution_impl_fwd_t()); + return impl_->init(engine, this, use_temp_dst_); + } + bool with_scratchpad() const { return impl_->with_scratchpad(); } + std::shared_ptr impl_; + bool use_temp_dst_ = attr()->post_ops_.len() > 0; + memory_desc_t dst_md_temp_; + + private: + bool set_default_formats() { + using namespace format_tag; + if (src_md_.data_type == dnnl_s8) { + auto dat_tag = utils::pick(ndims() - 3, nwc, nhwc, ndhwc); + auto wei_tag = with_groups() + ? 
utils::pick(ndims() - 3, gowi, gohwi, godhwi) + : utils::pick(ndims() - 3, owi, ohwi, odhwi); + return set_default_formats_common(dat_tag, wei_tag, dat_tag); + } else { + auto dat_tag = utils::pick(ndims() - 3, ncw, nchw, ncdhw); + auto wei_tag = with_groups() + ? utils::pick(ndims() - 3, goiw, goihw, goidhw) + : utils::pick(ndims() - 3, oiw, oihw, oidhw); + return set_default_formats_common(dat_tag, wei_tag, dat_tag); + } + } + + bool post_ops_ok(const primitive_attr_t *attr) const { + const auto &p = attr->post_ops_; + auto is_eltwise + = [&](int idx) { return p.entry_[idx].is_eltwise(false); }; + auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(false); }; + + switch (p.len()) { + case 0: return true; // no post_ops + case 1: return is_eltwise(0) || is_sum(0); // sum OR eltwise + case 2: + if (src_md_.data_type == dnnl_s8 && is_eltwise(0) + && is_sum(1)) + return true; + return (is_sum(0) && is_eltwise(1)); + default: return false; + } + + return false; + } + + bool check_s8_configuration() const { + const auto check_nhwc = [](const dnnl_memory_desc_t &md, + bool is_weights = false) { + cudnnTensorFormat_t fmt; + get_format(&md, fmt, is_weights); + return fmt == CUDNN_TENSOR_NHWC; + }; + + return check_nhwc(src_md_) && check_nhwc(dst_md_) + && check_nhwc(weights_md_, true) + && (src_md_.dims[1] % 4) == 0 && (dst_md_.dims[1] % 4) == 0 + && ndims() < 5; + } + }; + + cudnn_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t init_temp_dst(engine_t *engine) { + auto sycl_engine = utils::downcast(engine); + memory_storage_t *scratch_ptr = nullptr; + auto wrap = memory_desc_wrapper(pd()->dst_md_temp_); + CHECK(sycl_engine->create_memory_storage( + &scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr)); + scratch_storage.reset(scratch_ptr); + + CHECK(sycl_engine->create_memory_storage( + &scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr)); + scratch_storage_2.reset(scratch_ptr); + + return status::success; + } + + virtual status_t init(engine_t *engine) { + if (pd()->use_temp_dst_) { init_temp_dst(engine); } + return status::success; + } + + status_t execute(const exec_ctx_t &ctx) const override { + if (pd()->check_for_zero_dims()) { return status::success; } + + execute_convolution(ctx, pd()->with_bias(), pd()->with_scratchpad()); + + return status::success; + } + status_t execute_convolution( + const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const; + +private: + cl::sycl::buffer &buffer(memory_storage_t *mem_storage) const { + return utils::downcast( + mem_storage) + ->buffer(); + } + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + std::shared_ptr scratch_storage; + std::shared_ptr scratch_storage_2; +}; + +struct cudnn_convolution_bwd_data_t : public primitive_t { + + struct pd_t : public cudnn_convolution_bwd_data_pd_t { + using cudnn_convolution_bwd_data_pd_t::cudnn_convolution_bwd_data_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_convolution_bwd_data_t); + + status_t init(engine_t *engine) { + using namespace data_type; + bool ok = desc()->prop_kind == prop_kind::backward_data; + ok = ok && this->set_default_formats(); + ok = ok + && (utils::everyone_is(f32, diff_src_md_.data_type, + weights_md_.data_type, diff_dst_md_.data_type) + || utils::everyone_is(f16, diff_src_md_.data_type, + weights_md_.data_type, + diff_dst_md_.data_type)); + + ok = ok + && IMPLICATION( + desc()->alg_kind == dnnl_convolution_winograd, + ndims() < 5); + ok = ok && memory_format_ok(&diff_src_md_); + ok = ok && 
memory_format_ok(&weights_md_); + ok = ok && memory_format_ok(&diff_dst_md_); + if (with_bias()) { + ok = ok && memory_format_ok(&bias_md_); + ok = ok && bias_md_.data_type == diff_dst_md_.data_type; + } + if (!ok) return status::unimplemented; + + if (check_for_zero_dims()) return status::success; + + impl_.reset(new cudnn_convolution_impl_bwd_data_t()); + return impl_->init(engine, this); + } + + std::shared_ptr impl_; + + bool set_default_formats() { + using namespace format_tag; + auto dat_tag = utils::pick(ndims() - 3, ncw, nchw, ncdhw); + auto wei_tag = with_groups() + ? utils::pick(ndims() - 3, goiw, goihw, goidhw) + : utils::pick(ndims() - 3, oiw, oihw, oidhw); + return set_default_formats_common(dat_tag, wei_tag, dat_tag); + } + bool with_scratchpad() const { return impl_->with_scratchpad(); } + bool support_bias() const override { return true; } + }; + + cudnn_convolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) {} + ~cudnn_convolution_bwd_data_t() {} + status_t execute(const exec_ctx_t &ctx) const override { + if (pd()->check_for_zero_dims()) { return status::success; } + return execute_convolution( + ctx, pd()->with_bias(), pd()->with_scratchpad()); + } + status_t execute_convolution( + const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +struct cudnn_convolution_bwd_weights_t : public primitive_t { + + struct pd_t : public cudnn_convolution_bwd_weights_pd_t { + using cudnn_convolution_bwd_weights_pd_t:: + cudnn_convolution_bwd_weights_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_convolution_bwd_weights_t); + + status_t init(engine_t *engine) { + using namespace data_type; + bool ok = desc()->prop_kind == prop_kind::backward_weights; + ok = ok && this->set_default_formats(); + ok = ok + && (utils::everyone_is(f32, src_md_.data_type, + diff_weights_md_.data_type, + diff_dst_md_.data_type) + || utils::everyone_is(f16, src_md_.data_type, + diff_weights_md_.data_type, + diff_dst_md_.data_type)); + + ok = ok + && IMPLICATION( + desc()->alg_kind == dnnl_convolution_winograd, + ndims() < 5); + ok = ok && memory_format_ok(&src_md_); + ok = ok && memory_format_ok(&diff_weights_md_); + ok = ok && memory_format_ok(&diff_dst_md_); + if (with_bias()) { + ok = ok && memory_format_ok(&diff_bias_md_); + ok = ok && diff_bias_md_.data_type == diff_dst_md_.data_type; + } + if (!ok) return status::unimplemented; + + impl_.reset(new cudnn_convolution_impl_bwd_weights_t()); + if (check_for_zero_dims()) { return impl_->init_zero_dims(this); }; + + return impl_->init(engine, this); + } + + std::shared_ptr impl_; + + bool set_default_formats() { + using namespace format_tag; + auto dat_tag = utils::pick(ndims() - 3, ncw, nchw, ncdhw); + auto wei_tag = with_groups() + ? 
utils::pick(ndims() - 3, goiw, goihw, goidhw) + : utils::pick(ndims() - 3, oiw, oihw, oidhw); + return set_default_formats_common(dat_tag, wei_tag, dat_tag); + } + bool with_scratchpad() const { return impl_->with_scratchpad(); } + }; + + cudnn_convolution_bwd_weights_t(const pd_t *apd) : primitive_t(apd) {} + ~cudnn_convolution_bwd_weights_t() {} + status_t execute(const exec_ctx_t &ctx) const override { + if (pd()->check_for_zero_dims()) { return execute_zero_dims(ctx); } + return execute_convolution( + ctx, pd()->with_bias(), pd()->with_scratchpad()); + } + status_t execute_convolution( + const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const; + status_t execute_zero_dims(const exec_ctx_t &ctx) const; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_convolution_impl.hpp b/src/gpu/nvidia/cudnn_convolution_impl.hpp new file mode 100644 index 00000000000..fad83e94dc0 --- /dev/null +++ b/src/gpu/nvidia/cudnn_convolution_impl.hpp @@ -0,0 +1,900 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_CONVOLUTION_IMPL_HPP +#define GPU_NVIDIA_CUDNN_CONVOLUTION_IMPL_HPP + +#include "cudnn.h" + +#include "common/c_types_map.hpp" +#include "common/convolution_pd.hpp" +#include "gpu/nvidia/cudnn_conv_filter_adjustment_base.hpp" +#include "gpu/nvidia/cudnn_convolution_pd.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_convolution_impl_base_t + : public cudnn_conv_filter_adjustment_base_t { +protected: + enum io { x = 0, bias, weights, y, NUM_IO }; + memory_desc_t dnnl_descs[NUM_IO]; + cudnnConvolutionDescriptor_t conv_desc; + int padding[CUDNN_DIM_MAX]; + int dilation[CUDNN_DIM_MAX]; + cudnnTensorDescriptor_t descs[NUM_IO]; + cudnnDataType_t data_types[NUM_IO]; + int ndims[NUM_IO]; + int dims[NUM_IO][DNNL_MAX_NDIMS]; + int strides[NUM_IO + 1][DNNL_MAX_NDIMS]; + int filter_strides[DNNL_MAX_NDIMS]; + cudnnTensorFormat_t formats[NUM_IO]; + bool filter_needs_transform = false; + cudnnFilterDescriptor_t weights_desc; + float alpha = 0.f; + float beta = 0.f; + int group_count = 1; + bool with_groups = false; + size_t scratchpad_size = 0; + bool with_bias = false; + + bool do_scaling = false; + float output_scaling = 1.0f; + cudnnDataType_t computation_data_type = CUDNN_DATA_FLOAT; + cudnnDataType_t reorder_type = CUDNN_DATA_INT8; + +public: + virtual ~cudnn_convolution_impl_base_t() { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyFilterDescriptor, weights_desc); + CUDNN_EXECUTE_FUNC_V(cudnnDestroyConvolutionDescriptor, conv_desc); + for (size_t i = 0; i < io::NUM_IO; i++) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, descs[i]); + } + } + virtual status_t configure_alg_kind(engine_t *, convolution_pd_t *pd) = 0; + + virtual bool supported_filter_format(const memory_desc_t *md) const { + const memory_desc_wrapper mem_wrapper(md); + + return (mem_wrapper.matches_one_of_tag(format_tag::ab, format_tag::abc, + format_tag::abcd, format_tag::abcde, format_tag::abcdef) + || (with_groups ? 
mem_wrapper.matches_one_of_tag( + format_tag::gowi, format_tag::gohwi, + format_tag::godhwi) + : mem_wrapper.matches_one_of_tag( + format_tag::owi, format_tag::ohwi, + format_tag::odhwi))); + } + + bool using_transformed_filter() const { return filter_needs_transform; } + bool with_scratchpad() const { return scratchpad_size > 0; } + + virtual status_t init(engine_t *engine, convolution_pd_t *pd, + bool use_scratch_dst = false) { + CHECK(configure_parameters(pd, use_scratch_dst)); + CHECK(create_cudnn_descs(pd)); + CHECK(check_output_dims()); + CHECK(configure_alg_kind(engine, pd)); + CHECK(init_scratchpad(engine, pd)); + + return status::success; + } + + virtual status_t init_zero_dims(convolution_pd_t *pd) { + return status::success; + } + void get_dims_and_strides(int io) { + convert_dims( + dnnl_descs[io].dims, dims[io], dnnl_descs[io].ndims, ndims[io]); + if (ndims[io] > dnnl_descs[io].ndims) { + std::swap(dims[io][ndims[io] - 1], dims[io][ndims[io] - 2]); + if (ndims[io] == 4) { + if (formats[io] == CUDNN_TENSOR_NHWC) { + propagate_strides(strides[io], dims[io], {1, 3, 2, 0}); + } else { + propagate_strides(strides[io], dims[io], {3, 2, 1, 0}); + } + } + } else { + convert_dims(dnnl_descs[io].format_desc.blocking.strides, + strides[io], dnnl_descs[io].ndims, ndims[io]); + } + } + status_t configure_parameters( + const convolution_pd_t *pd, bool use_scratch_dst) { + if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; } + CHECK(set_padding_and_dilation(pd)); + with_groups = pd->with_groups(); + with_bias = pd->with_bias(); + alpha = 1.0f; + beta = 0.0f; + output_scaling = pd->attr()->output_scales_.scales_[0]; + do_scaling = output_scaling != 1.f; + dnnl_descs[x] = *pd->invariant_src_md(); + dnnl_descs[weights] = *pd->invariant_wei_md(); + dnnl_descs[y] = *pd->invariant_dst_md(); + if (with_bias) dnnl_descs[bias] = *pd->invariant_bia_md(); + + ndims[x] = std::max(dnnl_descs[x].ndims, 4); + ndims[weights] = std::max(dnnl_descs[weights].ndims, 4 + with_groups); + ndims[y] = std::max(dnnl_descs[y].ndims, 4); + + CHECK(convert_data_type(&dnnl_descs[x], &data_types[x])); + CHECK(convert_data_type(&dnnl_descs[weights], &data_types[weights])); + CHECK(convert_data_type(&dnnl_descs[y], &data_types[y])); + + CHECK(get_formats()); + set_compute_format(); + get_dims_and_strides(x); + get_dims_and_strides(weights); + get_dims_and_strides(y); + + if (!supported_filter_format(&dnnl_descs[weights])) { + set_filter_format( + ndims[weights], dims[weights], strides[NUM_IO], formats[x]); + CHECK(init_filter_transformation(data_types[weights], + ndims[weights], dims[weights], strides[weights], + strides[NUM_IO])); + filter_needs_transform = true; + // we transform the filter based on src format + formats[weights] = formats[x]; + } else { + CHECK(get_filter_format()); + get_dims_and_strides(weights); + } + if (with_groups) { + dims[weights][1] *= pd->G(); + ndims[weights] = std::max(4, ndims[weights] - with_groups); + } + + if (with_bias) { + ndims[bias] = dnnl_descs[bias].ndims; + CHECK(convert_data_type(&dnnl_descs[bias], &data_types[bias])); + convert_dims( + dnnl_descs[bias].dims, dims[bias], ndims[bias], ndims[y]); + std::swap(dims[bias][0], dims[bias][1]); + convert_dims(dnnl_descs[bias].format_desc.blocking.strides, + strides[bias], ndims[bias], ndims[y]); + ndims[bias] = ndims[y]; + } + + return status::success; + } + + status_t create_cudnn_descs(const convolution_pd_t *pd) { + CHECK(create_and_set_convolution_desc(pd)); + CHECK(create_and_set_tensor_descriptor( + &descs[x], 
data_types[x], ndims[x], dims[x], strides[x])); + CHECK(create_and_set_filter_descriptor(&weights_desc, formats[weights], + data_types[weights], ndims[weights], + dims[weights] + with_groups, strides[weights])); + CHECK(create_and_set_tensor_descriptor( + &descs[y], data_types[y], ndims[y], dims[y], strides[y])); + + if (with_bias) { + CHECK(create_and_set_tensor_descriptor(&descs[bias], + data_types[bias], ndims[bias], dims[bias], strides[bias])); + } + + return status::success; + } + virtual status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) { + if (filter_needs_transform) { + auto sz = memory_desc_wrapper(&dnnl_descs[weights]).size(); + auto data_size + = types::data_type_size(pd->invariant_wei_md(0)->data_type); + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_conv_cudnn_filter, sz, + data_size); + } + return status::success; + }; + + status_t create_and_set_convolution_desc(const convolution_pd_t *pd) { + CUDNN_EXECUTE_FUNC_V(cudnnCreateConvolutionDescriptor, &conv_desc); + CUDNN_EXECUTE_FUNC_V(cudnnSetConvolutionNdDescriptor, conv_desc, + ndims[x] - 2, padding, filter_strides, dilation, + cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION, + computation_data_type); + // Check for groups and set group count if necessary + if (with_groups) { + group_count = pd->G(); + if (group_count > 1) + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnSetConvolutionGroupCount, conv_desc, group_count)); + } + return status::success; + } + + status_t set_padding_and_dilation(const convolution_pd_t *pd) { + int actual_ndims = pd->ndims(); + if (actual_ndims == 3) { + padding[0] = 0; + padding[1] = static_cast(pd->padL()); + dilation[0] = 1; + dilation[1] = static_cast(pd->KDW() + 1); + + filter_strides[0] = 1; + filter_strides[1] = static_cast(pd->KSW()); + } else if (actual_ndims == 4) { + padding[0] = static_cast(pd->padT()); + padding[1] = static_cast(pd->padL()); + + dilation[0] = static_cast(pd->KDH() + 1); + dilation[1] = static_cast(pd->KDW() + 1); + + filter_strides[0] = static_cast(pd->KSH()); + filter_strides[1] = static_cast(pd->KSW()); + } else { + padding[0] = static_cast(pd->padFront()); + padding[1] = static_cast(pd->padT()); + padding[2] = static_cast(pd->padL()); + + dilation[0] = static_cast(pd->KDD() + 1); + dilation[1] = static_cast(pd->KDH() + 1); + dilation[2] = static_cast(pd->KDW() + 1); + + filter_strides[0] = static_cast(pd->KSD()); + filter_strides[1] = static_cast(pd->KSH()); + filter_strides[2] = static_cast(pd->KSW()); + } + return status::success; + } + + virtual void execute( + cudnnHandle_t handle, const std::vector &args) const = 0; + + void execute_sum(cudnnHandle_t handle, void *x, void *y, float alpha_, + float beta_) const { + float alpha = alpha_; + float beta = beta_; + CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &alpha, descs[io::y], x, + &beta, descs[io::y], y); + } + + void execute_scale(cudnnHandle_t handle, void *y) const { + if (do_scaling) { + CUDNN_EXECUTE_FUNC_V( + cudnnScaleTensor, handle, descs[io::y], y, &output_scaling); + } + } + + void execute_set_weights_bias( + cudnnHandle_t handle, void *weights, void *bias, float value) { + CUDNN_EXECUTE_FUNC_V( + cudnnSetTensor, handle, descs[io::weights], weights, &value); + if (bias) { + CUDNN_EXECUTE_FUNC_V( + cudnnSetTensor, handle, descs[io::bias], bias, &value); + } + } + + bool with_eltwise(const convolution_pd_t *pd, int position) const { + return pd->attr()->post_ops_.contain(primitive_kind::eltwise, position); + } + + status_t check_output_dims() const { + int 
expected_dims[CUDNN_DIM_MAX] = {}; + CUDNN_EXECUTE_FUNC_V(cudnnGetConvolutionNdForwardOutputDim, conv_desc, + descs[x], weights_desc, ndims[y], &expected_dims[0]); + for (size_t i = 0; i < ndims[y]; i++) { + if (dims[y][i] != expected_dims[i]) return status::unimplemented; + } + return status::success; + } + + void set_compute_format() { + if (data_types[x] == CUDNN_DATA_INT8) { + computation_data_type = CUDNN_DATA_INT32; + } else { + computation_data_type = data_types[y]; + } + } + + status_t get_filter_format() { + memory_desc_wrapper wrapper(&dnnl_descs[weights]); + if (wrapper.matches_one_of_tag(format_tag::ab, format_tag::abc, + format_tag::abcd, format_tag::abcde, format_tag::abcdef)) { + formats[weights] = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW; + } else if ((!with_groups + && wrapper.matches_one_of_tag(format_tag::owi, + format_tag::ohwi, format_tag::odhwi)) + || (with_groups + && wrapper.matches_one_of_tag(format_tag::gowi, + format_tag::gohwi, format_tag::godhwi))) { + formats[weights] = cudnnTensorFormat_t::CUDNN_TENSOR_NHWC; + } else { + return status::unimplemented; + } + + return status::success; + } + + status_t get_formats() { + CHECK(get_format(&dnnl_descs[x], formats[x])); + CHECK(get_format(&dnnl_descs[y], formats[y])); + return status::success; + } + + void set_filter_nhwc(int filter_ndims, int *transform_filter_strides, + int *filter_dims) override { + if (with_groups) { + switch (filter_ndims) { + case 4: // Convert to krsc + return propagate_strides(transform_filter_strides, + filter_dims, {2, 3, 1, 0}); + case 5: + return propagate_strides(transform_filter_strides, + filter_dims, {2, 4, 3, 1, 0}); + case 6: + return propagate_strides(transform_filter_strides, + filter_dims, {2, 5, 4, 3, 1, 0}); + } + } else { + cudnn_conv_filter_adjustment_base_t::set_filter_nhwc( + filter_ndims, transform_filter_strides, filter_dims); + } + } +}; + +struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t { +protected: + cudnnActivationDescriptor_t activation_desc = nullptr; + cudnnActivationDescriptor_t eltwise_desc = nullptr; + cudnnTensorDescriptor_t reorder_dst_desc = nullptr; + cudnnConvolutionFwdAlgo_t fwd_alg_kind; + std::vector perf; + int requested_algo_count = 0; + int returned_algo_count = 0; + int num_post_ops = 0; + primitive_kind_t post_ops[2]; + bool need_reorder = false; + bool use_temp_dst = false; + float sum_scale = 1.0f; + +public: + virtual ~cudnn_convolution_impl_fwd_t() { + if (activation_desc) + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyActivationDescriptor, activation_desc); + if (eltwise_desc) + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyActivationDescriptor, eltwise_desc); + if (reorder_dst_desc) + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorDescriptor, reorder_dst_desc); + } + + status_t configure_post_ops(convolution_pd_t *pd) { + auto &p = pd->attr()->post_ops_; + num_post_ops = p.len(); + if (data_types[y] == CUDNN_DATA_INT8 && p.len() > 0) { + data_types[y] = CUDNN_DATA_FLOAT; + need_reorder = true; + } + for (size_t i = 0; i < p.len(); i++) { + post_ops[i] = p.entry_[i].kind; + if (post_ops[i] == dnnl_eltwise) { + create_and_set_eltwise_descriptor(pd); + } + if (post_ops[i] == dnnl_sum) { sum_scale = p.entry_[i].sum.scale; } + } + + if (need_reorder) + CHECK(create_and_set_tensor_descriptor_ex(&reorder_dst_desc, + formats[y], reorder_type, ndims[y], dims[y])); + + return status::success; + } + + status_t init(engine_t *engine, convolution_pd_t *pd, + bool use_scratch_dst) override { + use_temp_dst = use_scratch_dst; + 
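// Post-op handling in this forward implementation (see configure_post_ops()
// above and execute() below): when the destination is int8 and post-ops are
// present, the convolution writes into an f32 temporary (args[6]) and, for a
// sum post-op, the original int8 dst is first reordered into args[7]; the
// final result is transformed back to int8 at the end of execute().
// Arguments arrive positionally from
// cudnn_convolution_fwd_t::execute_convolution():
//   args[0]=src, args[1]=weights, args[2]=dst, args[3]=bias,
//   args[4]=algorithm workspace, args[5]=filter-transform scratch,
//   args[6]=temporary dst, args[7]=reorder scratch.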
CHECK(configure_parameters(pd, use_temp_dst)); + CHECK(configure_post_ops(pd)); + CHECK(create_cudnn_descs(pd)); + CHECK(configure_alg_kind(engine, pd)); + CHECK(init_scratchpad(engine, pd)); + + return status::success; + } + + void execute_reorder(cudnnHandle_t handle, void *src, void *dst, + bool flip_formats) const { + const float alpha = 1.0f; + const float beta = 0.0f; + if (flip_formats) { + CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha, + reorder_dst_desc, src, &beta, descs[y], dst); + } else { + CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha, descs[y], + src, &beta, reorder_dst_desc, dst); + } + } + + void execute_eltwise(cudnnHandle_t handle, void *src, void *dst) const { + float alpha = 1.0f; + float beta = 0.0f; + CUDNN_EXECUTE_FUNC_V(cudnnActivationForward, handle, eltwise_desc, + &alpha, descs[io::y], src, &beta, descs[io::y], dst); + } + + void execute(cudnnHandle_t handle, + const std::vector &args) const override { + auto x = args[0], weights = args[1], y = args[2], bias = args[3], + scratchpad = args[4], post_op_scratch = args[6], + post_op_reorder = args[7]; + void *output = use_temp_dst ? post_op_scratch : y; + if (using_transformed_filter()) { + auto w_scratch = args[5]; + transform_filter(handle, weights, w_scratch); + weights = w_scratch; + } + if (computation_data_type == CUDNN_DATA_INT32 && bias) { + CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBiasActivationForward, handle, + &alpha, descs[io::x], x, weights_desc, weights, conv_desc, + fwd_alg_kind, scratchpad, scratchpad_size, &beta, + descs[io::y], output, descs[io::bias], bias, + activation_desc, descs[io::y], output); + } else { + const float bias_alpha = 1.0f; + const float bias_beta = 1.0f; + CUDNN_EXECUTE_FUNC_V(cudnnConvolutionForward, handle, &alpha, + descs[io::x], x, weights_desc, weights, conv_desc, + fwd_alg_kind, scratchpad, scratchpad_size, &beta, + descs[io::y], output); + if (with_bias) { + CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &bias_alpha, + descs[io::bias], bias, &bias_beta, descs[io::y], + output); + } + } + execute_scale(handle, output); + for (int i = 0; i < num_post_ops; i++) { + bool last_op = i == num_post_ops - 1 && !need_reorder; + if (last_op) output = y; + switch (post_ops[i]) { + case dnnl_sum: + if (need_reorder) { + execute_reorder(handle, y, post_op_reorder, true); + execute_sum(handle, post_op_reorder, post_op_scratch, + sum_scale, 1.0f); + } else if (last_op) { + execute_sum( + handle, post_op_scratch, y, 1.0f, sum_scale); + } else { + execute_sum( + handle, y, post_op_scratch, sum_scale, 1.0f); + } + + break; + + case dnnl_eltwise: + execute_eltwise(handle, post_op_scratch, output); + break; + } + } + + if (need_reorder) { + execute_reorder(handle, post_op_scratch, y, false); + } + } + status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) { + auto &sycl_engine = *utils::downcast(engine); + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + + auto cuda_stream + = utils::downcast(service_stream); + auto handle = cuda_stream->get_cudnn_handle(); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardWorkspaceSize, + handle, descs[x], weights_desc, conv_desc, descs[y], + fwd_alg_kind, &scratchpad_size)); + if (scratchpad_size > 0) + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_conv_cudnn_algo, + scratchpad_size, size_t(1)); + + return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd); + } + status_t configure_alg_kind( + engine_t *engine, convolution_pd_t *pd) override { + auto 
&sycl_engine = *utils::downcast(engine); + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + + auto cuda_stream + = utils::downcast(service_stream); + auto handle = cuda_stream->get_cudnn_handle(); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardAlgorithmMaxCount, + handle, &requested_algo_count)); + perf.resize(requested_algo_count); + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionForwardAlgorithm, handle, + descs[x], weights_desc, conv_desc, descs[y], + requested_algo_count, &returned_algo_count, perf.data())); + for (size_t i = 0; i < returned_algo_count; i++) { + if (perf[i].status == CUDNN_STATUS_SUCCESS) { + // cudnnFindConvolutionForwardAlgorithm can erroneously report + // algorithms for int8 which does not work so ensure that we + // only allow CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM + // in this case. + if (computation_data_type == CUDNN_DATA_INT32 + && perf[i].algo + != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) { + continue; + } + switch (pd->desc()->alg_kind) { + case dnnl_convolution_auto: + if (utils::one_of(perf[i].algo, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM)) { + utils::downcast(pd) + ->set_alg_kind(dnnl_convolution_direct); + } else { + utils::downcast(pd) + ->set_alg_kind(dnnl_convolution_winograd); + } + break; + case dnnl_convolution_direct: + if (!utils::one_of(perf[i].algo, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM)) + continue; + break; + case dnnl_convolution_winograd: + if (!utils::one_of(perf[i].algo, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED)) + continue; + break; + default: return status::unimplemented; + } + fwd_alg_kind = perf[i].algo; + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType, + conv_desc, perf[i].mathType)); + break; + } else { + return status::unimplemented; + } + } + + if (fwd_alg_kind == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) { + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnCreateActivationDescriptor, &activation_desc)); + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, + activation_desc, + cudnnActivationMode_t::CUDNN_ACTIVATION_IDENTITY, + CUDNN_NOT_PROPAGATE_NAN, 1.0)); + } + + return status::success; + } + + status_t create_and_set_eltwise_descriptor(const convolution_pd_t *pd) { + + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnCreateActivationDescriptor, &eltwise_desc)); + + cudnnActivationMode_t act_mode; + switch (eltwise_algorithm_kind(pd)) { + case alg_kind::eltwise_tanh: + act_mode = CUDNN_ACTIVATION_TANH; + break; + case alg_kind::eltwise_elu: act_mode = CUDNN_ACTIVATION_ELU; break; + case alg_kind::eltwise_relu: + act_mode = CUDNN_ACTIVATION_RELU; + break; + case alg_kind::eltwise_logistic: + act_mode = CUDNN_ACTIVATION_SIGMOID; + break; + case alg_kind::eltwise_bounded_relu: + act_mode = CUDNN_ACTIVATION_CLIPPED_RELU; + break; + default: return status::unimplemented; + } + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, eltwise_desc, + act_mode, cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN, + eltwise_alpha(pd))); + + return status::success; + } + + dnnl::impl::alg_kind_t eltwise_algorithm_kind( + const convolution_pd_t *pd) const { + const int eltwise_idx + = pd->attr()->post_ops_.find(primitive_kind::eltwise); + return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alg; + } + + float eltwise_alpha(const convolution_pd_t *pd) const { 
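// Returns the alpha of the eltwise post-op. This value is forwarded as the
// coef argument of cudnnSetActivationDescriptor, where cuDNN interprets it as
// the clipping threshold for CUDNN_ACTIVATION_CLIPPED_RELU and as the alpha
// coefficient for CUDNN_ACTIVATION_ELU (for plain ReLU a coefficient of 0
// means no ceiling, as noted earlier in this patch).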
+ const int eltwise_idx + = pd->attr()->post_ops_.find(primitive_kind::eltwise); + return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alpha; + } +}; + +struct cudnn_convolution_impl_bwd_data_t + : public cudnn_convolution_impl_base_t { +protected: + cudnnConvolutionBwdDataAlgo_t bwd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + std::vector perf; + int requested_algo_count = 0; + int returned_algo_count = 0; + status_t configure_alg_kind( + engine_t *engine, convolution_pd_t *pd) override { + auto &sycl_engine = *utils::downcast(engine); + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + + auto cuda_stream + = utils::downcast(service_stream); + auto handle = cuda_stream->get_cudnn_handle(); + + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnGetConvolutionBackwardDataAlgorithmMaxCount, handle, + &requested_algo_count)); + perf.resize(requested_algo_count); + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionBackwardDataAlgorithm, + handle, weights_desc, descs[y], conv_desc, descs[x], + requested_algo_count, &returned_algo_count, perf.data())); + for (size_t i = 0; i < returned_algo_count; i++) { + if (perf[i].status == CUDNN_STATUS_SUCCESS) { + switch (pd->desc()->alg_kind) { + case dnnl_convolution_auto: + if (utils::one_of(perf[i].algo, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1)) { + utils::downcast( + pd) + ->set_alg_kind(dnnl_convolution_direct); + } else { + utils::downcast( + pd) + ->set_alg_kind(dnnl_convolution_winograd); + } + break; + case dnnl_convolution_direct: + if (!utils::one_of(perf[i].algo, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1)) + continue; + break; + case dnnl_convolution_winograd: + if (!utils::one_of(perf[i].algo, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED)) + continue; + break; + default: return status::unimplemented; + } + bwd_algo = perf[i].algo; + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType, + conv_desc, perf[i].mathType)); + break; + } else { + return status::unimplemented; + } + } + + return status::success; + } + + status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) override { + auto &sycl_engine = *utils::downcast(engine); + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + + auto cuda_stream + = utils::downcast(service_stream); + auto handle = cuda_stream->get_cudnn_handle(); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionBackwardDataWorkspaceSize, + handle, weights_desc, descs[io::y], conv_desc, descs[io::x], + bwd_algo, &scratchpad_size)); + if (scratchpad_size > 0) + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_conv_cudnn_algo, + scratchpad_size, size_t(1)); + + return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd); + } + + void execute(cudnnHandle_t handle, + const std::vector &args) const override { + auto x = args[0], weights = args[1], y = args[2], bias = args[3], + scratchpad = args[4]; + if (using_transformed_filter()) { + auto w_scratch = args[5]; + transform_filter(handle, weights, w_scratch); + weights = w_scratch; + } + const float bias_alpha = 1.0f; + const float bias_beta = 1.0f; + CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardData, handle, &alpha, + weights_desc, weights, descs[io::y], y, conv_desc, bwd_algo, + scratchpad, scratchpad_size, &beta, descs[io::x], x); + if (with_bias) { + CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &bias_alpha, + descs[io::bias], bias, &bias_beta, descs[io::x], x); + } + } 
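// Bias handling above: cudnnConvolutionBackwardData produces diff_src, and
// when a bias is supplied (the backward-data pd reports support_bias()) it is
// broadcast-added in place with cudnnAddTensor, which computes
// C = alpha * A + beta * C; with alpha = beta = 1 this accumulates the bias
// into the freshly computed diff_src.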
+}; + +struct cudnn_convolution_impl_bwd_weights_t + : public cudnn_convolution_impl_base_t { +protected: + cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo + = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + std::vector perf; + int requested_algo_count = 0; + int returned_algo_count = 0; + +public: + status_t init_zero_dims(convolution_pd_t *pd) override { + if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; } + dnnl_descs[weights] = *pd->invariant_wei_md(); + CHECK(get_format(&dnnl_descs[weights], formats[weights], true)); + ndims[y] = pd->invariant_dst_md()->ndims; + ndims[weights] = dnnl_descs[weights].ndims - pd->with_groups(); + CHECK(convert_data_type(&dnnl_descs[weights], &data_types[weights])); + convert_dims(dnnl_descs[weights].dims + pd->with_groups(), + dims[weights], ndims[weights]); + ndims[weights] = std::max(4, ndims[weights]); + convert_dims(dnnl_descs[weights].format_desc.blocking.strides, + strides[weights], ndims[weights]); + CHECK(create_and_set_tensor_descriptor(&descs[weights], + data_types[weights], ndims[weights], dims[weights], + strides[weights])); + + if (pd->with_bias()) { + dnnl_descs[bias] = *pd->invariant_bia_md(); + ndims[bias] = dnnl_descs[bias].ndims; + CHECK(convert_data_type(&dnnl_descs[bias], &data_types[bias])); + convert_dims(dnnl_descs[bias].padded_dims, dims[bias], ndims[bias], + ndims[y]); + std::swap(dims[bias][0], dims[bias][1]); + convert_dims(dnnl_descs[bias].format_desc.blocking.strides, + strides[bias], ndims[bias], ndims[weights]); + ndims[bias] = ndims[y]; + CHECK(create_and_set_tensor_descriptor(&descs[bias], + data_types[bias], ndims[bias], dims[bias], strides[bias])); + } + return status::success; + } + virtual status_t configure_alg_kind( + engine_t *engine, convolution_pd_t *pd) { + auto &sycl_engine = *utils::downcast(engine); + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + + auto cuda_stream + = utils::downcast(service_stream); + auto handle = cuda_stream->get_cudnn_handle(); + + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnGetConvolutionBackwardFilterAlgorithmMaxCount, handle, + &requested_algo_count)); + perf.resize(requested_algo_count); + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionBackwardFilterAlgorithm, + handle, descs[x], descs[y], conv_desc, weights_desc, + requested_algo_count, &returned_algo_count, perf.data())); + for (size_t i = 0; i < returned_algo_count; i++) { + if (perf[i].status == CUDNN_STATUS_SUCCESS) { + switch (pd->desc()->alg_kind) { + case dnnl_convolution_auto: + if (utils::one_of(perf[i].algo, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3)) { + utils::downcast< + cudnn_convolution_bwd_weights_pd_t *>(pd) + ->set_alg_kind(dnnl_convolution_direct); + } else { + utils::downcast< + cudnn_convolution_bwd_weights_pd_t *>(pd) + ->set_alg_kind(dnnl_convolution_winograd); + } + break; + case dnnl_convolution_direct: + if (!utils::one_of(perf[i].algo, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3)) + continue; + break; + case dnnl_convolution_winograd: + if (!utils::one_of(perf[i].algo, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED)) + continue; + break; + default: return status::unimplemented; + } + bwd_filter_algo = perf[i].algo; + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType, + conv_desc, perf[i].mathType)); + break; + } else { + return status::unimplemented; + } + } + + 
return status::success; + } + + status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) override { + auto &sycl_engine = *utils::downcast(engine); + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + + auto cuda_stream + = utils::downcast(service_stream); + auto handle = cuda_stream->get_cudnn_handle(); + + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnGetConvolutionBackwardFilterWorkspaceSize, handle, + descs[io::x], descs[io::y], conv_desc, weights_desc, + bwd_filter_algo, &scratchpad_size)); + if (scratchpad_size > 0) + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_conv_cudnn_algo, + scratchpad_size, size_t(1)); + + return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd); + } + + void execute(cudnnHandle_t handle, + const std::vector &args) const override { + auto x = args[0], weights = args[1], y = args[2], bias = args[3], + scratchpad = args[4]; + auto filter = weights; + if (using_transformed_filter()) { + auto w_scratch = args[5]; + transform_filter(handle, weights, w_scratch); + filter = w_scratch; + } + const float bias_alpha = 1.0f; + const float bias_beta = 0.0f; + CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardFilter, handle, &alpha, + descs[io::x], x, descs[io::y], y, conv_desc, bwd_filter_algo, + scratchpad, scratchpad_size, &beta, weights_desc, filter); + if (with_bias) { + CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardBias, handle, + &bias_alpha, descs[io::y], y, &bias_beta, descs[io::bias], + bias); + } + if (using_transformed_filter()) { + undo_transform_filter(handle, filter, weights); + } + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_convolution_pd.hpp b/src/gpu/nvidia/cudnn_convolution_pd.hpp new file mode 100644 index 00000000000..400d2e8327e --- /dev/null +++ b/src/gpu/nvidia/cudnn_convolution_pd.hpp @@ -0,0 +1,77 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
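The backward-weights execute path above boils down to two cuDNN calls: the filter gradient, and, when a bias is present, the bias gradient reduced over the non-channel dimensions of diff_dst. A minimal sketch of those two calls with pre-created descriptors (names illustrative; the workspace is assumed to be the scratchpad booked in init_scratchpad):

#include <cudnn.h>

// dw = conv_bwd_filter(x, dy); db = reduction of dy over N, H, W.
void conv_bwd_weights_and_bias(cudnnHandle_t handle,
        cudnnTensorDescriptor_t x_desc, const void *x,
        cudnnTensorDescriptor_t dy_desc, const void *dy,
        cudnnConvolutionDescriptor_t conv_desc,
        cudnnConvolutionBwdFilterAlgo_t algo, void *workspace,
        size_t workspace_bytes, cudnnFilterDescriptor_t dw_desc, void *dw,
        cudnnTensorDescriptor_t db_desc, void *db /* may be null */) {
    const float alpha = 1.0f, beta = 0.0f;
    cudnnConvolutionBackwardFilter(handle, &alpha, x_desc, x, dy_desc, dy,
            conv_desc, algo, workspace, workspace_bytes, &beta, dw_desc, dw);
    if (db) {
        cudnnConvolutionBackwardBias(
                handle, &alpha, dy_desc, dy, &beta, db_desc, db);
    }
}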
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_CONVOLUTION_PD_HPP +#define GPU_NVIDIA_CUDNN_CONVOLUTION_PD_HPP + +#include "common/convolution_pd.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_convolution_fwd_pd_t : public convolution_fwd_pd_t { + using convolution_fwd_pd_t::convolution_fwd_pd_t; + + bool set_alg_kind(alg_kind_t kind) { return set_default_alg_kind(kind); } + + bool check_for_zero_dims() const { + return has_zero_dims( + invariant_src_md()->dims, invariant_src_md()->ndims) + || has_zero_dims( + invariant_wei_md(0)->dims, invariant_wei_md(0)->ndims) + || has_zero_dims( + invariant_dst_md()->dims, invariant_dst_md()->ndims); + } +}; +struct cudnn_convolution_bwd_data_pd_t : public convolution_bwd_data_pd_t { + using convolution_bwd_data_pd_t::convolution_bwd_data_pd_t; + + bool set_alg_kind(alg_kind_t kind) { return set_default_alg_kind(kind); } + + bool check_for_zero_dims() const { + return has_zero_dims( + invariant_src_md()->dims, invariant_src_md()->ndims) + || has_zero_dims( + invariant_wei_md(0)->dims, invariant_wei_md(0)->ndims) + || has_zero_dims( + invariant_dst_md()->dims, invariant_dst_md()->ndims); + } +}; +struct cudnn_convolution_bwd_weights_pd_t + : public convolution_bwd_weights_pd_t { + using convolution_bwd_weights_pd_t::convolution_bwd_weights_pd_t; + + bool set_alg_kind(alg_kind_t kind) { return set_default_alg_kind(kind); } + + bool check_for_zero_dims() const { + return has_zero_dims( + invariant_src_md()->dims, invariant_src_md()->ndims) + || has_zero_dims( + invariant_wei_md(0)->dims, invariant_wei_md(0)->ndims) + || has_zero_dims( + invariant_dst_md()->dims, invariant_dst_md()->ndims); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl +#endif diff --git a/src/gpu/nvidia/cudnn_deconvolution.cpp b/src/gpu/nvidia/cudnn_deconvolution.cpp new file mode 100644 index 00000000000..84083ce10a0 --- /dev/null +++ b/src/gpu/nvidia/cudnn_deconvolution.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
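The pd wrappers above are what oneDNN's dispatcher instantiates when a convolution is requested on a CUDA-backed engine; from the application's side nothing changes relative to other engines. A minimal usage sketch with the public C++ API (shapes, engine index, and the f32/nchw choices are made up for illustration; assumes a build with the SYCL/CUDA runtime this patch adds and the v1.x-style desc/primitive_desc API):

#include "dnnl.hpp"
using namespace dnnl;

void create_conv_on_gpu() {
    engine eng(engine::kind::gpu, 0); // CUDA device when built with this backend
    stream s(eng);

    memory::desc src_md({8, 32, 28, 28}, memory::data_type::f32,
            memory::format_tag::any);
    memory::desc wei_md({64, 32, 3, 3}, memory::data_type::f32,
            memory::format_tag::any);
    memory::desc dst_md({8, 64, 28, 28}, memory::data_type::f32,
            memory::format_tag::any);

    convolution_forward::desc cd(prop_kind::forward_inference,
            algorithm::convolution_direct, src_md, wei_md, dst_md,
            {1, 1} /* strides */, {1, 1} /* padding_l */, {1, 1} /* padding_r */);
    // The cuDNN-backed pd_t::init() runs while this primitive_desc is built.
    convolution_forward::primitive_desc pd(cd, eng);
    convolution_forward conv(pd);
    (void)conv; (void)s;
}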
+*******************************************************************************/ + +#include "gpu/nvidia/cudnn_deconvolution.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_deconvolution_bwd_weights_t::execute_bias( + const exec_ctx_t &ctx) const { + if (memory_desc_wrapper(pd()->diff_dst_md(0)).has_zero_dim()) + return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto bias_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS); + auto y_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + auto bias = sc.memory(ih, bias_acc); + auto y = sc.memory(ih, y_acc); + + impl_->execute_bias(handle, y, bias); + }); + }); +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_deconvolution.hpp b/src/gpu/nvidia/cudnn_deconvolution.hpp new file mode 100644 index 00000000000..69cd596e1bb --- /dev/null +++ b/src/gpu/nvidia/cudnn_deconvolution.hpp @@ -0,0 +1,476 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_DECONVOLUTION_HPP +#define GPU_NVIDIA_CUDNN_DECONVOLUTION_HPP + +#include "cudnn.h" + +#include "common/c_types_map.hpp" +#include "common/deconvolution_pd.hpp" +#include "common/primitive_iterator.hpp" +#include "gpu/nvidia/cudnn_convolution.hpp" +#include "gpu/nvidia/cudnn_deconvolution_impl.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +namespace { +static status_t compute_blocked_format( + bool with_groups, const memory_desc_t *oi_md, memory_desc_t *io_md) { + /* Computes blocking for *i*o* format from *o*i* format */ + + bool sanity_check_ok = true && oi_md->ndims == io_md->ndims + && oi_md->format_kind == format_kind::blocked; + if (!sanity_check_ok) return status::invalid_arguments; + + const blocking_desc_t &oi_blk = oi_md->format_desc.blocking; + blocking_desc_t io_blk = io_md->format_desc.blocking; + + io_md->format_kind = format_kind::blocked; + io_blk = oi_blk; + + const int ID_OC = 0 + with_groups; + const int ID_IC = 1 + with_groups; + + nstl::swap(io_blk.strides[ID_OC], io_blk.strides[ID_IC]); + for (int i_blk = 0; i_blk < io_blk.inner_nblks; ++i_blk) { + if (utils::one_of(io_blk.inner_idxs[i_blk], ID_OC, ID_IC)) { + io_blk.inner_idxs[i_blk] + = (io_blk.inner_idxs[i_blk] == ID_OC ? 
ID_IC : ID_OC); + } + } + + return memory_desc_init_by_blocking_desc(*io_md, io_blk); +} + +static status_t conv_descr_create( + const deconvolution_desc_t *dd, convolution_desc_t *cd) { + using namespace prop_kind; + alg_kind_t alg_kind = dd->alg_kind == alg_kind::deconvolution_direct + ? alg_kind::convolution_direct + : alg_kind::convolution_winograd; + + const memory_desc_t *src_md, *dst_md, *d_weights_d; + prop_kind_t prop_kind; + memory_desc_t c_weights_d; + if (utils::one_of(dd->prop_kind, forward_training, forward_inference)) { + prop_kind = backward_data; + src_md = &dd->dst_desc; + dst_md = &dd->src_desc; + d_weights_d = &dd->weights_desc; + } else if (dd->prop_kind == backward_data) { + prop_kind = forward_training; + src_md = &dd->diff_dst_desc; + dst_md = &dd->diff_src_desc; + d_weights_d = &dd->weights_desc; + } else { + prop_kind = dd->prop_kind; + src_md = &dd->diff_dst_desc; + dst_md = &dd->src_desc; + d_weights_d = &dd->diff_weights_desc; + } + + const bool with_groups = d_weights_d->ndims == src_md->ndims + 1; + + /* create weights desc for convolution */ + c_weights_d = *d_weights_d; + + const int ID_OC = 0 + with_groups; + const int ID_IC = 1 + with_groups; + + nstl::swap(c_weights_d.dims[ID_OC], c_weights_d.dims[ID_IC]); + nstl::swap(c_weights_d.padded_dims[ID_OC], c_weights_d.padded_dims[ID_IC]); + nstl::swap(c_weights_d.padded_offsets[ID_OC], + c_weights_d.padded_offsets[ID_IC]); + + if (c_weights_d.format_kind != format_kind::any) + CHECK(compute_blocked_format(with_groups, d_weights_d, &c_weights_d)); + + return conv_desc_init(cd, prop_kind, alg_kind, src_md, &c_weights_d, + prop_kind != backward_weights ? &dd->bias_desc : nullptr, dst_md, + dd->strides, dd->dilates, dd->padding[0], dd->padding[1]); +} +} // namespace + +struct cudnn_deconvolution_fwd_t : public primitive_t { + struct pd_t : public deconvolution_fwd_pd_t { + pd_t(const deconvolution_desc_t *adesc, const primitive_attr_t *attr, + const deconvolution_fwd_pd_t *hint_fwd_pd) + : deconvolution_fwd_pd_t(adesc, attr, hint_fwd_pd) + , conv_pd_(nullptr) {} + + pd_t(const pd_t &other) + : deconvolution_fwd_pd_t(other) + , conv_pd_(other.conv_pd_->clone()) + , conv_supports_bias_(other.conv_supports_bias_) + , dst_tag_(other.dst_tag_) {} + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_deconvolution_fwd_t); + + status_t init_convolution(engine_t *engine) { + using namespace format_tag; + using namespace data_type; + + convolution_desc_t cd; + CHECK(conv_descr_create(desc(), &cd)); + primitive_attr_t conv_attr = *attr(); + conv_attr.set_scratchpad_mode(scratchpad_mode::user); + dnnl_primitive_desc_iterator it( + engine, (op_desc_t *)&cd, &conv_attr, nullptr); + while (++it != it.end()) { + primitive_desc_t *conv_pd = it.fetch_once(); + conv_supports_bias_ + = static_cast(conv_pd) + ->support_bias(); + bool ref_deconv_supports_bias = true + && desc()->accum_data_type == data_type::f32 + && utils::one_of(desc()->dst_desc.data_type, f32, f16) + && IMPLICATION(desc()->src_desc.data_type == f16, + memory_desc_matches_one_of_tag( + *conv_pd->diff_src_md(), + utils::pick(ndims() - 3, ncw, nchw, + ncdhw))); + bool ok = true + && conv_pd->weights_md()->extra.flags == 0 + /* deconv reference code can process only f32 bias */ + && IMPLICATION(with_bias(), + conv_supports_bias_ + || ref_deconv_supports_bias); + if (ok) { + conv_pd_.reset(conv_pd); + return status::success; + } + } + conv_pd_.reset(); + return status::unimplemented; + } + + status_t init(engine_t *engine) { + using namespace format_tag; + bool ok = true && 
is_fwd(); + ok = ok + && utils::one_of(desc()->alg_kind, + alg_kind::deconvolution_direct, + alg_kind::deconvolution_winograd); + ok = ok && attr_.has_default_values(); + ok = ok + && (utils::everyone_is(data_type::f32, + desc()->src_desc.data_type, + desc()->weights_desc.data_type, + desc()->dst_desc.data_type) + || utils::everyone_is(data_type::f16, + desc()->src_desc.data_type, + desc()->weights_desc.data_type, + desc()->dst_desc.data_type)); + + if (ok) { + CHECK(init_convolution(engine)); + if (weights_md_.format_kind == format_kind::any) { + CHECK(compute_blocked_format(with_groups(), + conv_pd_->weights_md(), &desc_.weights_desc)); + weights_md_ = desc_.weights_desc; + } + if (src_md_.format_kind == format_kind::any) + src_md_ = *conv_pd_->diff_dst_md(); + if (dst_md_.format_kind == format_kind::any) + dst_md_ = *conv_pd_->diff_src_md(); + if (bias_md_.format_kind == format_kind::any) + CHECK(memory_desc_init_by_tag(bias_md_, x)); + + dst_tag_ = memory_desc_matches_one_of_tag(dst_md_, + utils::pick(ndims() - 3, ncw, nchw, ncdhw), + utils::pick(ndims() - 3, nCw4c, nChw4c, nCdhw4c)); + init_scratchpad(); + return status::success; + } + + return status::unimplemented; + } + + void init_scratchpad() { + auto scratchpad = scratchpad_registry().registrar(); + scratchpad.book(memory_tracking::names::key_nested, + conv_pd_->scratchpad_registry()); + } + + std::unique_ptr conv_pd_; + bool conv_supports_bias_; + format_tag_t dst_tag_; + }; + + cudnn_deconvolution_fwd_t(const pd_t *apd) : primitive_t(apd) {} + + ~cudnn_deconvolution_fwd_t() {} + + virtual status_t init(engine_t *engine) { + return pd()->conv_pd_->create_primitive(conv_p_, engine); + } + + status_t execute(const exec_ctx_t &ctx) const { + using namespace memory_tracking::names; + const auto &args = ctx.args(); + exec_args_t conv_args; + conv_args[DNNL_ARG_DIFF_DST] = args.at(DNNL_ARG_SRC); + conv_args[DNNL_ARG_WEIGHTS] = args.at(DNNL_ARG_WEIGHTS); + conv_args[DNNL_ARG_DIFF_SRC] = args.at(DNNL_ARG_DST); + if (pd()->with_bias()) + conv_args[DNNL_ARG_BIAS] = args.at(DNNL_ARG_BIAS); + exec_ctx_t conv_ctx(ctx.stream(), std::move(conv_args)); + + nested_scratchpad_t ns(ctx, key_nested, conv_p_); + conv_ctx.set_scratchpad_grantor(ns.grantor()); + // Executing the convolution kernel + status_t status = conv_p_->execute(conv_ctx); + return status; + } + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + std::shared_ptr conv_p_; +}; + +struct cudnn_deconvolution_bwd_data_t : public primitive_t { + struct pd_t : public deconvolution_bwd_data_pd_t { + pd_t(const deconvolution_desc_t *adesc, const primitive_attr_t *attr, + const deconvolution_fwd_pd_t *hint_fwd_pd) + : deconvolution_bwd_data_pd_t(adesc, attr, hint_fwd_pd) + , conv_pd_(nullptr) {} + + pd_t(const pd_t &other) + : deconvolution_bwd_data_pd_t(other) + , conv_pd_(other.conv_pd_->clone()) {} + + ~pd_t() {} + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_deconvolution_bwd_data_t); + + status_t init_convolution(engine_t *engine) { + convolution_desc_t cd; + CHECK(conv_descr_create(desc(), &cd)); + primitive_attr_t conv_attr = *attr(); + conv_attr.set_scratchpad_mode(scratchpad_mode::user); + dnnl_primitive_desc_iterator it( + engine, (op_desc_t *)&cd, &conv_attr, nullptr); + while (++it != it.end()) { + primitive_desc_t *_conv_pd = it.fetch_once(); + conv_pd_.reset(_conv_pd); + return status::success; + } + return status::unimplemented; + } + + status_t init(engine_t *engine) { + bool ok = true && desc()->prop_kind == prop_kind::backward_data + 
&& (utils::everyone_is(data_type::f32, + desc()->diff_src_desc.data_type, + desc()->weights_desc.data_type, + desc()->diff_dst_desc.data_type) + || utils::everyone_is(data_type::f16, + desc()->weights_desc.data_type, + desc()->diff_dst_desc.data_type)) + && utils::one_of(desc()->diff_src_desc.data_type, + data_type::f16, data_type::f32) + && desc()->alg_kind == alg_kind::deconvolution_direct + && attr()->has_default_values(); + + if (ok) { + CHECK(init_convolution(engine)); + if (weights_md_.format_kind == format_kind::any) { + CHECK(compute_blocked_format(with_groups(), + conv_pd_->weights_md(), &desc_.weights_desc)); + weights_md_ = desc_.weights_desc; + } + if (diff_src_md_.format_kind == format_kind::any) + diff_src_md_ = *conv_pd_->dst_md(); + if (diff_dst_md_.format_kind == format_kind::any) + diff_dst_md_ = *conv_pd_->src_md(); + init_scratchpad(); + return status::success; + } + + return status::unimplemented; + } + + void init_scratchpad() { + auto scratchpad = scratchpad_registry().registrar(); + scratchpad.book(memory_tracking::names::key_nested, + conv_pd_->scratchpad_registry()); + } + + std::unique_ptr conv_pd_; + }; + + cudnn_deconvolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) {} + + ~cudnn_deconvolution_bwd_data_t() {} + + virtual status_t init(engine_t *engine) { + return pd()->conv_pd_->create_primitive(conv_p_, engine); + } + + status_t execute(const exec_ctx_t &ctx) const { + using namespace memory_tracking::names; + const auto &args = ctx.args(); + exec_args_t conv_args; + conv_args[DNNL_ARG_SRC] = args.at(DNNL_ARG_DIFF_DST); + conv_args[DNNL_ARG_WEIGHTS] = args.at(DNNL_ARG_WEIGHTS); + conv_args[DNNL_ARG_DST] = args.at(DNNL_ARG_DIFF_SRC); + if (!types::is_zero_md(pd()->scratchpad_md())) + conv_args[DNNL_ARG_SCRATCHPAD] = args.at(DNNL_ARG_SCRATCHPAD); + exec_ctx_t conv_ctx(ctx.stream(), std::move(conv_args)); + + nested_scratchpad_t ns(ctx, key_nested, conv_p_); + conv_ctx.set_scratchpad_grantor(ns.grantor()); + // Executing the convolution kernel + status_t status = conv_p_->execute(conv_ctx); + return status; + } + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + std::shared_ptr conv_p_; +}; + +struct cudnn_deconvolution_bwd_weights_t : public primitive_t { + struct pd_t : public deconvolution_bwd_weights_pd_t { + pd_t(const deconvolution_desc_t *adesc, const primitive_attr_t *attr, + const deconvolution_fwd_pd_t *hint_fwd_pd) + : deconvolution_bwd_weights_pd_t(adesc, attr, hint_fwd_pd) + , conv_pd_(nullptr) {} + + pd_t(const pd_t &other) + : deconvolution_bwd_weights_pd_t(other) + , conv_pd_(other.conv_pd_->clone()) {} + + ~pd_t() {} + + DECLARE_COMMON_PD_T( + "cuda:cudnn:any", cudnn_deconvolution_bwd_weights_t); + + status_t init_convolution(engine_t *engine) { + convolution_desc_t cd; + CHECK(conv_descr_create(desc(), &cd)); + primitive_attr_t conv_attr = *attr(); + conv_attr.set_scratchpad_mode(scratchpad_mode::user); + dnnl_primitive_desc_iterator it( + engine, (op_desc_t *)&cd, &conv_attr, nullptr); + while (++it != it.end()) { + primitive_desc_t *_conv_pd = it.fetch_once(); + conv_pd_.reset(_conv_pd); + if (conv_pd_ == nullptr) return status::out_of_memory; + return status::success; + } + return status::unimplemented; + } + + status_t init(engine_t *engine) { + using namespace format_tag; + bool ok = true && desc()->prop_kind == prop_kind::backward_weights + && (utils::everyone_is(data_type::f32, + desc()->src_desc.data_type, + desc()->diff_weights_desc.data_type, + desc()->diff_dst_desc.data_type) + || 
utils::everyone_is(data_type::f16, + desc()->diff_dst_desc.data_type, + desc()->src_desc.data_type)) + && utils::one_of( + desc()->alg_kind, alg_kind::deconvolution_direct) + && attr()->has_default_values() + && utils::one_of(desc()->diff_weights_desc.data_type, + data_type::f16, data_type::f32); + if (ok) { + CHECK(init_convolution(engine)); + if (diff_weights_md_.format_kind == format_kind::any) { + CHECK(compute_blocked_format(with_groups(), + conv_pd_->diff_weights_md(), + &desc_.diff_weights_desc)); + diff_weights_md_ = desc_.diff_weights_desc; + } + if (src_md_.format_kind == format_kind::any) + src_md_ = *conv_pd_->diff_dst_md(); + if (diff_dst_md_.format_kind == format_kind::any) + diff_dst_md_ = *conv_pd_->src_md(); + if (diff_bias_md_.format_kind == format_kind::any) + CHECK(memory_desc_init_by_tag(diff_bias_md_, x)); + init_scratchpad(); + return status::success; + } + + return status::unimplemented; + } + + void init_scratchpad() { + auto scratchpad = scratchpad_registry().registrar(); + scratchpad.book(memory_tracking::names::key_nested, + conv_pd_->scratchpad_registry()); + } + + std::unique_ptr conv_pd_; + }; + + cudnn_deconvolution_bwd_weights_t(const pd_t *apd) : primitive_t(apd) {} + + ~cudnn_deconvolution_bwd_weights_t() {} + + virtual status_t init(engine_t *engine) { + if (pd()->with_bias()) { + if (pd()->ndims() > CUDNN_DIM_MAX) return status::invalid_arguments; + + impl_ = std::make_shared(); + impl_->init(pd()->invariant_dst_md(), pd()->invariant_bia_md()); + } + return pd()->conv_pd_->create_primitive(conv_p_, engine); + } + + status_t execute(const exec_ctx_t &ctx) const { + using namespace memory_tracking::names; + const auto &args = ctx.args(); + exec_args_t conv_args; + conv_args[DNNL_ARG_DIFF_DST] = args.at(DNNL_ARG_SRC); + conv_args[DNNL_ARG_SRC] = args.at(DNNL_ARG_DIFF_DST); + conv_args[DNNL_ARG_DIFF_WEIGHTS] = args.at(DNNL_ARG_DIFF_WEIGHTS); + if (!types::is_zero_md(pd()->scratchpad_md())) + conv_args[DNNL_ARG_SCRATCHPAD] = args.at(DNNL_ARG_SCRATCHPAD); + + exec_ctx_t conv_ctx(ctx, std::move(conv_args)); + + nested_scratchpad_t ns(ctx, key_nested, conv_p_); + conv_ctx.set_scratchpad_grantor(ns.grantor()); + status_t status = conv_p_->execute(conv_ctx); + if (status != status::success) return status; + + if (pd()->with_bias()) { return execute_bias(ctx); } + return status::success; + } + + status_t execute_bias(const exec_ctx_t &ctx) const; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + std::shared_ptr conv_p_; + std::shared_ptr impl_; +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_deconvolution_impl.hpp b/src/gpu/nvidia/cudnn_deconvolution_impl.hpp new file mode 100644 index 00000000000..2dd9477173d --- /dev/null +++ b/src/gpu/nvidia/cudnn_deconvolution_impl.hpp @@ -0,0 +1,92 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
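All three deconvolution primitives above delegate to a convolution after viewing the weights with the input/output channel axes exchanged; compute_blocked_format and conv_descr_create implement that logical transpose by swapping the OC and IC entries of the dims, strides, and inner-block indices. A minimal sketch of the idea on plain arrays (simplified; the real code operates on memory_desc_t and its blocking descriptor):

#include <algorithm>
#include <cstdio>

// Swap the output-channel and input-channel axes of a weights layout
// description. `with_groups` shifts both indices by one, as in the patch.
void swap_oc_ic(long dims[], long strides[], bool with_groups) {
    const int id_oc = 0 + with_groups;
    const int id_ic = 1 + with_groups;
    std::swap(dims[id_oc], dims[id_ic]);
    std::swap(strides[id_oc], strides[id_ic]);
}

int main() {
    // Dense oihw weights, 64x32x3x3.
    long dims[4] = {64, 32, 3, 3};
    long strides[4] = {32 * 3 * 3, 3 * 3, 3, 1};
    swap_oc_ic(dims, strides, /*with_groups=*/false);
    // The same buffer is now described as iohw 32x64x3x3, so the
    // deconvolution weights can be handed to the convolution unchanged.
    std::printf("%ldx%ldx%ldx%ld\n", dims[0], dims[1], dims[2], dims[3]);
    return 0;
}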
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_DECONVOLUTION_IMPL_HPP +#define GPU_NVIDIA_CUDNN_DECONVOLUTION_IMPL_HPP + +#include "cudnn.h" + +#include "common/c_types_map.hpp" +#include "common/deconvolution_pd.hpp" +#include "gpu/nvidia/cudnn_convolution_pd.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_deconvolution_bwd_bias_impl_t { +protected: + enum io { y = 0, bias, NUM_IO }; + memory_desc_t dnnl_descs[NUM_IO]; + cudnnTensorDescriptor_t descs[NUM_IO]; + int dims[NUM_IO][DNNL_MAX_NDIMS]; + int strides[NUM_IO][DNNL_MAX_NDIMS]; + int ndims[NUM_IO]; + cudnnDataType_t data_types[NUM_IO]; + +public: + ~cudnn_deconvolution_bwd_bias_impl_t() { + for (size_t i = 0; i < NUM_IO; i++) { + if (descs[i]) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, descs[i]); + } + } + } + + status_t init(const memory_desc_t *dst, const memory_desc_t *bia) { + dnnl_descs[y] = *dst; + dnnl_descs[bias] = *bia; + + ndims[y] = dnnl_descs[y].ndims; + ndims[bias] = dnnl_descs[bias].ndims; + convert_dims(dnnl_descs[y].padded_dims, dims[y], ndims[y]); + CHECK(convert_data_type(&dnnl_descs[y], &data_types[y])); + CHECK(convert_data_type(&dnnl_descs[bias], &data_types[bias])); + convert_dims(dnnl_descs[y].format_desc.blocking.strides, strides[y], + ndims[y]); + ndims[y] = std::max(4, ndims[y]); + convert_dims(dnnl_descs[bias].format_desc.blocking.strides, + strides[bias], ndims[bias], ndims[y]); + convert_dims(dnnl_descs[bias].padded_dims, dims[bias], ndims[bias], + ndims[y]); + std::swap(dims[bias][0], dims[bias][1]); + ndims[bias] = ndims[y]; + CHECK(create_and_set_tensor_descriptor( + &descs[y], data_types[y], ndims[y], dims[y], strides[y])); + CHECK(create_and_set_tensor_descriptor(&descs[bias], data_types[bias], + ndims[bias], dims[bias], strides[bias])); + + return status::success; + } + + void execute_bias(cudnnHandle_t handle, void *y, void *bias) const { + const float bias_alpha = 1.0f; + const float bias_beta = 0.0f; + CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardBias, handle, &bias_alpha, + descs[io::y], y, &bias_beta, descs[io::bias], bias); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_eltwise.cpp b/src/gpu/nvidia/cudnn_eltwise.cpp new file mode 100644 index 00000000000..29ae0aa59f6 --- /dev/null +++ b/src/gpu/nvidia/cudnn_eltwise.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
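The bias helper above reshapes the 1D bias into a 4D tensor whose only non-unit dimension is the channel axis (hence the swap of dims[bias][0] and dims[bias][1]), which is the shape cudnnConvolutionBackwardBias expects to reduce into. A standalone sketch of the same descriptor setup (illustrative; f32, NCHW):

#include <cudnn.h>

// Describe an OC-element bias as a 1 x OC x 1 x 1 tensor so that cuDNN
// reduces diff_dst over N, H and W into it.
cudnnStatus_t make_bias_desc(int oc, cudnnTensorDescriptor_t *db_desc) {
    cudnnStatus_t st = cudnnCreateTensorDescriptor(db_desc);
    if (st != CUDNN_STATUS_SUCCESS) return st;
    return cudnnSetTensor4dDescriptor(*db_desc, CUDNN_TENSOR_NCHW,
            CUDNN_DATA_FLOAT, /*n=*/1, /*c=*/oc, /*h=*/1, /*w=*/1);
}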
+*******************************************************************************/ + +#include "gpu/nvidia/cudnn_eltwise.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "sycl/sycl_buffer_memory_storage.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_eltwise_fwd_t::execute(const exec_ctx_t &ctx) const { + if (memory_desc_wrapper(pd()->src_md()).has_zero_dim()) + return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + std::vector args; + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + args.push_back(sc.memory(ih, src_acc)); + args.push_back(sc.memory(ih, dst_acc)); + + pd()->eltwise_fwd_impl_->execute(handle, args.data(), args.size()); + }); + }); +} + +status_t cudnn_eltwise_bwd_t::execute(const exec_ctx_t &ctx) const { + if (memory_desc_wrapper(pd()->src_md()).has_zero_dim()) + return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC); + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + std::vector args; + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + args.push_back(sc.memory(ih, src_acc)); + args.push_back(sc.memory(ih, diff_dst_acc)); + args.push_back(sc.memory(ih, diff_src_acc)); + + pd()->eltwise_bwd_impl_->execute(handle, args.data(), args.size()); + }); + }); +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_eltwise.hpp b/src/gpu/nvidia/cudnn_eltwise.hpp new file mode 100644 index 00000000000..5fd572a411b --- /dev/null +++ b/src/gpu/nvidia/cudnn_eltwise.hpp @@ -0,0 +1,116 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
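Both eltwise entry points above use the submission pattern that recurs throughout this backend: request SYCL accessors for the arguments, then wrap the cuDNN call in an interop task so it runs on the stream's native CUDA context. A condensed view of that pattern, reusing the helpers this patch introduces (sycl_cuda_stream_t, sycl_cuda_engine_t, cuda_sycl_scoped_context_handler_t, the CTX_*_ACCESSOR macros); it is a sketch of the shape of the code, not a standalone program, and only the primitive-specific body changes from file to file:

status_t submit_cudnn_op(const exec_ctx_t &ctx) {
    auto *cuda_stream
            = utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());

    return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
        // 1. Request SYCL accessors for every oneDNN argument up front.
        auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
        auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);

        // 2. Inside the interop task the native cuDNN handle and raw device
        //    pointers become available through the scoped-context helper.
        cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
            auto &sycl_engine = *utils::downcast<nvidia::sycl_cuda_engine_t *>(
                    cuda_stream->engine());
            auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
            auto handle = cuda_stream->get_cudnn_handle();

            void *src = sc.memory<void *>(ih, src_acc);
            void *dst = sc.memory<void *>(ih, dst_acc);

            // 3. Issue the primitive-specific cuDNN call(s) here, e.g.
            //    cudnnActivationForward(handle, ..., src, ..., dst);
            (void)handle; (void)src; (void)dst;
        });
    });
}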
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_SYCL_CUDA_ELTWISE_HPP +#define GPU_NVIDIA_SYCL_CUDA_ELTWISE_HPP + +#include "common/eltwise_pd.hpp" +#include "common/primitive.hpp" +#include "gpu/nvidia/cudnn_eltwise_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_eltwise_fwd_t : public primitive_t { + + struct pd_t : public eltwise_fwd_pd_t { + using eltwise_fwd_pd_t::eltwise_fwd_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_eltwise_fwd_t); + + status_t init(engine_t *) { + using namespace alg_kind; + bool ok = true + && utils::one_of(desc()->prop_kind, + prop_kind::forward_training, + prop_kind::forward_inference) + // Supported algorithms + && utils::one_of(desc()->alg_kind, eltwise_relu, + eltwise_bounded_relu, eltwise_tanh, eltwise_elu, + eltwise_logistic) + // Supported data types + && utils::one_of(desc()->data_desc.data_type, + data_type::f32, data_type::f16, data_type::s8) + && IMPLICATION(desc()->alg_kind == eltwise_relu, + desc()->alpha == 0) + // Eltwise does not support blocking + && src_md()->format_desc.blocking.inner_nblks == 0 + && attr()->has_default_values(); + if (!ok) return status::unimplemented; + + eltwise_fwd_impl_.reset(new cudnn_eltwise_fwd_impl_t()); + return eltwise_fwd_impl_->init(this); + } + std::shared_ptr eltwise_fwd_impl_; + }; + + cudnn_eltwise_fwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +struct cudnn_eltwise_bwd_t : public primitive_t { + + struct pd_t : public eltwise_bwd_pd_t { + using eltwise_bwd_pd_t::eltwise_bwd_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_eltwise_bwd_t); + + status_t init(engine_t *) { + using namespace alg_kind; + bool ok = true + && desc()->prop_kind == prop_kind::backward_data + // Supported algorithms + && utils::one_of(desc()->alg_kind, eltwise_bounded_relu, + eltwise_relu) + // Supported data types + && desc()->data_desc.data_type == data_type::f32 + && IMPLICATION(desc()->alg_kind == eltwise_relu, + desc()->alpha == 0) + && set_default_formats_common() + // Eltwise does not support blocking + && src_md()->format_desc.blocking.inner_nblks == 0 + && diff_dst_md()->format_desc.blocking.inner_nblks == 0 + && attr()->has_default_values(); + if (!ok) return status::unimplemented; + + eltwise_bwd_impl_.reset(new cudnn_eltwise_bwd_impl_t()); + return eltwise_bwd_impl_->init(this); + } + std::shared_ptr eltwise_bwd_impl_; + }; + + cudnn_eltwise_bwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_eltwise_impl.hpp b/src/gpu/nvidia/cudnn_eltwise_impl.hpp new file mode 100644 index 00000000000..24c3b0331ff --- /dev/null +++ b/src/gpu/nvidia/cudnn_eltwise_impl.hpp @@ -0,0 +1,203 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
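The implementation header that follows maps the oneDNN algorithm kind onto a cudnnActivationMode_t and then runs the activation through a descriptor, with the eltwise alpha passed as cuDNN's `coef` (the clipping bound for bounded ReLU, ignored for plain ReLU). A minimal standalone sketch of that cuDNN side, assuming a prebuilt 4D tensor descriptor:

#include <cudnn.h>

// y = relu(x) over an already-described tensor; alpha/beta here are cuDNN's
// output blending factors, not the oneDNN eltwise alpha.
cudnnStatus_t run_relu(cudnnHandle_t handle, cudnnTensorDescriptor_t desc,
        const void *x, void *y) {
    cudnnActivationDescriptor_t act;
    cudnnStatus_t st = cudnnCreateActivationDescriptor(&act);
    if (st != CUDNN_STATUS_SUCCESS) return st;
    st = cudnnSetActivationDescriptor(act, CUDNN_ACTIVATION_RELU,
            CUDNN_PROPAGATE_NAN, /*coef=*/0.0);
    if (st == CUDNN_STATUS_SUCCESS) {
        const float alpha = 1.0f, beta = 0.0f;
        st = cudnnActivationForward(
                handle, act, &alpha, desc, x, &beta, desc, y);
    }
    cudnnDestroyActivationDescriptor(act);
    return st;
}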
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_SYCL_CUDA_ELTWISE_IMPL_HPP +#define GPU_NVIDIA_SYCL_CUDA_ELTWISE_IMPL_HPP + +#include "cudnn.h" + +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_eltwise_impl_base_t { + +public: + virtual status_t init(const eltwise_pd_t *pd) = 0; + + virtual void execute(cudnnHandle_t handle, void **x, int size) const = 0; + + virtual status_t create_and_set_act_descriptor() { + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnCreateActivationDescriptor, &act_desc_)); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, act_desc_, + alg_kind, cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN, coef)); + + return status::success; + } + + // Mapping between dnnl algorithm and cuDNN activation mode + status_t convert_alg_kind( + alg_kind_t alg_kind, cudnnActivationMode_t *cuda_alg_kind) const { + switch (alg_kind) { + case alg_kind::eltwise_relu: + *cuda_alg_kind = cudnnActivationMode_t::CUDNN_ACTIVATION_RELU; + break; + case alg_kind::eltwise_bounded_relu: + *cuda_alg_kind + = cudnnActivationMode_t::CUDNN_ACTIVATION_CLIPPED_RELU; + break; + case alg_kind::eltwise_tanh: + *cuda_alg_kind = cudnnActivationMode_t::CUDNN_ACTIVATION_TANH; + break; + case alg_kind::eltwise_elu: + *cuda_alg_kind = cudnnActivationMode_t::CUDNN_ACTIVATION_ELU; + break; + case alg_kind::eltwise_logistic: + *cuda_alg_kind + = cudnnActivationMode_t::CUDNN_ACTIVATION_SIGMOID; + break; + default: return status::unimplemented; + } + return status::success; + } + + virtual ~cudnn_eltwise_impl_base_t() { + if (act_desc_) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyActivationDescriptor, act_desc_); + } + } + +protected: + int ndims; + cudnnActivationDescriptor_t act_desc_ = nullptr; + cudnnActivationMode_t alg_kind; + // alpha and beta are post operation scaling parameters used by cuDNN + float alpha = 1; + float beta = 0; + // coef in cuDNN is use for Relu (is equal to zero) and BRelu (represents + // the bound) + double coef = 0; +}; + +struct cudnn_eltwise_fwd_impl_t : public cudnn_eltwise_impl_base_t { +public: + status_t init(const eltwise_pd_t *pd) override { + // If any of the dimensions are 0 we should not continue with creating + // cudnn descriptors + if (has_zero_dims(pd->src_md()->dims, pd->ndims())) { + return status::success; + } + if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; } + ndims = pd->ndims() < 4 ? 
4 : pd->ndims(); + + // Obtain source and destination dimensions, strides and datatype + convert_dims(pd->src_md()->padded_dims, dims_, pd->ndims()); + convert_dims(pd->src_md()->format_desc.blocking.strides, strides_, + pd->ndims()); + CHECK(convert_data_type(pd->src_md(), &data_type_)); + + // Get cuDNN activation mode + alg_kind_t alg = pd->desc()->alg_kind; + auto alg_ok = convert_alg_kind(alg, &alg_kind); + if (alg_ok != status::success) { return status::unimplemented; } + coef = pd->desc()->alpha; + + CHECK(create_and_set_tensor_descriptor( + &tensor_desc_, data_type_, ndims, dims_, strides_)); + CHECK(create_and_set_act_descriptor()); + return status::success; + } + + void execute(cudnnHandle_t handle, void **x, int size) const override { + // Confirm that 2 arguments were passed src and dst + assert(size == 2); + CUDNN_EXECUTE_FUNC(cudnnActivationForward, handle, act_desc_, &alpha, + tensor_desc_, x[0], &beta, tensor_desc_, x[1]); + } + + ~cudnn_eltwise_fwd_impl_t() { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_desc_); + } + +private: + int strides_[DNNL_MAX_NDIMS]; + int dims_[DNNL_MAX_NDIMS]; + cudnnDataType_t data_type_; + cudnnTensorDescriptor_t tensor_desc_; +}; + +struct cudnn_eltwise_bwd_impl_t : public cudnn_eltwise_impl_base_t { + +public: + status_t init(const eltwise_pd_t *pd) override { + // If any of the dimensions are 0 we should not continue with creating + // cudnn descriptors + if (memory_desc_wrapper(pd->desc()->data_desc).has_zero_dim()) + return status::success; + + if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; } + ndims = pd->ndims() < 4 ? 4 : pd->ndims(); + + // Obtain dimension and strides for the backward eltwise operation + convert_dims(pd->src_md()->padded_dims, dims_, pd->ndims()); + + convert_dims(pd->src_md()->format_desc.blocking.strides, strides_, + pd->ndims()); + + alg_kind_t alg = pd->desc()->alg_kind; + auto alg_ok = convert_alg_kind(alg, &alg_kind); + if (alg_ok != status::success) { return status::unimplemented; } + coef = pd->desc()->alpha; + + // Check validity of input + assert(pd->diff_dst_md()->data_type == pd->src_md()->data_type); + assert(pd->diff_dst_md()->data_type == pd->diff_src_md()->data_type); + + CHECK(convert_data_type(pd->src_md(), &data_type_)); + + CHECK(create_and_set_tensor_descriptor( + &tensor_desc_src_, data_type_, ndims, dims_, strides_)); + CHECK(create_and_set_tensor_descriptor( + &tensor_diff_desc_, data_type_, ndims, dims_, strides_)); + CHECK(create_and_set_act_descriptor()); + return status::success; + } + + void execute(cudnnHandle_t handle, void **x, int size) const override { + // Assert that 3 arguments were passed src, diff_dst and diff_src + assert(size == 3); + void *dy = x[1]; + void *dx = x[2]; + CUDNN_EXECUTE_FUNC(cudnnActivationBackward, handle, act_desc_, &alpha, + tensor_desc_src_, x[0], tensor_diff_desc_, dy, tensor_desc_src_, + x[0], &beta, tensor_diff_desc_, dx); + } + + ~cudnn_eltwise_bwd_impl_t() { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_desc_src_); + CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_diff_desc_); + } + +private: + int dims_[DNNL_MAX_NDIMS]; + int strides_[DNNL_MAX_NDIMS]; + cudnnTensorDescriptor_t tensor_diff_desc_; + cudnnDataType_t data_type_; + cudnnTensorDescriptor_t tensor_desc_src_; +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_gemm_inner_product.hpp b/src/gpu/nvidia/cudnn_gemm_inner_product.hpp new file mode 100644 
index 00000000000..90a1884c78c --- /dev/null +++ b/src/gpu/nvidia/cudnn_gemm_inner_product.hpp @@ -0,0 +1,347 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_GEMM_INNER_PRODUCT_HPP +#define GPU_NVIDIA_CUDNN_GEMM_INNER_PRODUCT_HPP + +#include "cudnn.h" + +#include + +#include "common/c_types_map.hpp" +#include "common/inner_product_pd.hpp" +#include "common/primitive.hpp" +#include "gpu/nvidia/cudnn_gemm_inner_product_impl.hpp" +#include "gpu/nvidia/cudnn_inner_product.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { +namespace { + +inline bool gemm_consitency_check(const memory_desc_wrapper &src_d, + const memory_desc_wrapper &wei_d, const memory_desc_wrapper &dst_d) { + using namespace utils; + + auto strides_compatible = [&]() { + bool ok = true; + auto w_str = wei_d.blocking_desc().strides; + auto d_str = src_d.blocking_desc().strides; + for (int i = 1; i < src_d.ndims() - 1; i++) { + ok = ok && w_str[i] / d_str[i] == w_str[i + 1] / d_str[i + 1]; + } + return ok && one_of(w_str[1] / d_str[1], 1, wei_d.padded_dims()[0]); + }; + + auto inner_blk_compatible = [&]() { + auto d_inner_blks = src_d.blocking_desc().inner_blks; + auto w_inner_blks = wei_d.blocking_desc().inner_blks; + auto d_inner_idxs = src_d.blocking_desc().inner_idxs; + auto w_inner_idxs = wei_d.blocking_desc().inner_idxs; + + int d_inner_nblks = src_d.blocking_desc().inner_nblks; + int w_inner_nblks = wei_d.blocking_desc().inner_nblks; + + bool ok = true; + + if ((wei_d.blocking_desc().strides[0] == 1) && (w_inner_nblks > 0)) { + ok = ok && wei_d.dims()[0] / w_inner_blks[w_inner_nblks - 1] == 1 + && w_inner_idxs[w_inner_nblks - 1] == 0; + w_inner_nblks--; + } + // cudnn only supports blocking for channel C and type s8. Only + // blocksize 4 is supported. 
+ ok = ok && d_inner_nblks == w_inner_nblks; + bool supported_block_size = (d_inner_nblks == 0 + || (d_inner_nblks == 1 && d_inner_idxs[0] == w_inner_idxs[0] + && w_inner_idxs[0] == 1 + && d_inner_blks[0] == w_inner_blks[0] + && d_inner_blks[0] == 4 + && src_d.data_type() == data_type::s8)); + ok = ok && supported_block_size; + for (int d = 1; d < w_inner_nblks; d++) + ok = ok && (d_inner_blks[d] == w_inner_blks[d] == 0) + && (d_inner_idxs[d] == w_inner_idxs[d] == 0); + return ok; + }; + + return true && src_d.is_blocking_desc() && wei_d.is_blocking_desc() + && src_d.ndims() == wei_d.ndims() && inner_blk_compatible() + && strides_compatible() && dst_d.matches_tag(format_tag::nc) + && src_d.only_padded_dim(1) && wei_d.only_padded_dim(1) + && src_d.padded_dims()[1] == wei_d.padded_dims()[1]; +} + +inline bool reorder_check(const memory_desc_wrapper &src_d, + const memory_desc_wrapper &wei_d, const memory_desc_wrapper &dst_d) { + using namespace format_tag; + using namespace utils; + return true + && ((src_d.matches_tag(nwc) + && (wei_d.matches_one_of_tag(oiw, iwo) != undef)) + || (src_d.matches_tag(ncw) + && (wei_d.matches_one_of_tag(wio, owi) != undef)) + || (src_d.matches_tag(nhwc), + (wei_d.matches_one_of_tag(oihw, ihwo) != undef)) + || (src_d.matches_tag(nchw) + && (wei_d.matches_one_of_tag(ohwi, hwio) != undef)) + || (src_d.matches_tag(ndhwc) + && (wei_d.matches_one_of_tag(oidhw, idhwo) + != undef)) + || (src_d.matches_tag(ncdhw) + && (wei_d.matches_one_of_tag(odhwi, dhwio) + != undef))) + && dst_d.matches_tag(nc); +} + +inline bool dense_check(const memory_desc_wrapper &src_d, + const memory_desc_wrapper &wei_d, const memory_desc_wrapper &dst_d) { + return true && src_d.is_dense(true) && dst_d.is_dense() + && wei_d.is_dense(true); +} + +status_t template_set_default_params(memory_desc_t &src_md, + memory_desc_t &weights_md, memory_desc_t &dst_md, + memory_desc_t *bias_md, int ndims) { + using namespace format_tag; + + auto init_md = [&](memory_desc_t &out_md, const memory_desc_t &in_md) { + format_tag_t md_tag; + if (memory_desc_matches_one_of_tag(in_md, ab, abc, abcd, abcde)) + md_tag = utils::pick(ndims - 2, ab, abc, abcd, abcde); + else if (memory_desc_matches_one_of_tag(in_md, acb, acdb, acdeb)) + md_tag = utils::pick(ndims - 3, cba, cdba, cdeba); + else if (memory_desc_matches_one_of_tag(in_md, ba, cba, cdba, cdeba)) + md_tag = utils::pick(ndims - 2, ab, acb, acdb, acdeb); + else { + memory_desc_wrapper md_desc_wrapper(in_md); + return memory_desc_init_by_blocking_desc( + out_md, md_desc_wrapper.blocking_desc()); + } + return memory_desc_init_by_tag(out_md, md_tag); + }; + if (src_md.format_kind == format_kind::any + && weights_md.format_kind == format_kind::any) { + CHECK(memory_desc_init_by_tag( + src_md, utils::pick(ndims - 2, nc, ncw, nchw, ncdhw))); + CHECK(memory_desc_init_by_tag( + weights_md, utils::pick(ndims - 2, oi, oiw, oihw, oidhw))); + } else if (src_md.format_kind == format_kind::any) { + CHECK(init_md(src_md, weights_md)); + } else if (weights_md.format_kind == format_kind::any) { + CHECK(init_md(weights_md, src_md)); + } + if (dst_md.format_kind == format_kind::any) { + CHECK(memory_desc_init_by_tag(dst_md, nc)); + } + if (bias_md->format_kind == format_kind::any) { + CHECK(memory_desc_init_by_tag(*bias_md, x)); + } + return status::success; +} + +} // namespace + +struct cudnn_gemm_inner_product_fwd_t : public cudnn_inner_product_fwd_t { + using cudnn_inner_product_fwd_t::cudnn_inner_product_fwd_t; + using parrent_pd_t = cudnn_inner_product_fwd_t::pd_t; + + 
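The forward pd_t below accepts at most a sum post-op followed by an eltwise post-op (see post_ops_ok) plus a single common output scale, and only for s8 sources. On the application side such an attribute would be assembled roughly like this (v1.x-style attribute API; the scale values are illustrative):

#include "dnnl.hpp"
using namespace dnnl;

primitive_attr make_ip_attr() {
    post_ops po;
    po.append_sum(1.0f);                      // accumulate into dst first
    po.append_eltwise(1.0f, algorithm::eltwise_relu,
            /*alpha=*/0.f, /*beta=*/0.f);     // then apply ReLU

    primitive_attr attr;
    attr.set_post_ops(po);
    // A single common output scale (mask = 0); the pd below only allows
    // non-default scales for s8 sources.
    attr.set_output_scales(/*mask=*/0, {0.5f});
    return attr;
}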
struct pd_t : public parrent_pd_t { + using parrent_pd_t::parrent_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:gemm", cudnn_gemm_inner_product_fwd_t); + + status_t init(engine_t *engine) { + using namespace data_type; + using namespace prop_kind; + using namespace data_type; + + assert(engine->kind() == engine_kind::gpu); + bool ok = true && is_fwd() + && (set_default_params() == status::success); + if (!ok) return status::unimplemented; + if (has_zero_dim_memory()) return status::success; + bool gemm_compatible + = gemm_consitency_check(src_md(), weights_md(), dst_md()); + bool need_reorder = (gemm_compatible + ? false + : reorder_check(src_md(), weights_md(), dst_md())); + const auto attr_skip_mask = primitive_attr_t::skip_mask_t::oscale + | primitive_attr_t::skip_mask_t::post_ops; + + bool with_eltwise + = attr()->post_ops_.find(primitive_kind::eltwise) != -1; + bool with_sum = attr()->post_ops_.find(primitive_kind::sum) != -1; + ok = ok + && utils::one_of(true, + expect_data_types(f16, f16, f16, f16, f16), + expect_data_types(f16, f16, f32, f16, f32), + expect_data_types(s8, s8, f32, s8, s32), + expect_data_types(s8, s8, f32, f32, f32), + expect_data_types(f32, f32, f32, f32, f32)) + && memory_format_ok(src_md()) + && memory_format_ok(weights_md(0)) + && memory_format_ok(dst_md()) + && IMPLICATION(!attr()->output_scales_.has_default_values(), + utils::one_of(src_md_.data_type, s8) + && attr()->output_scales_.mask_ == 0) + && attr()->has_default_values(attr_skip_mask) + && post_ops_ok(attr()) + && dense_check(src_md(), weights_md(), dst_md()) + && (gemm_compatible || need_reorder); + if (!ok) return status::unimplemented; + + inner_product_impl_.reset( + new cudnn_gemm_inner_product_fwd_impl_t()); + return inner_product_impl_->init(engine, this, with_eltwise, + with_eltwise, with_sum, need_reorder); + } + + bool post_ops_ok(const primitive_attr_t *attr) const { + const auto &p = attr->post_ops_; + + auto is_eltwise + = [&](int idx) { return p.entry_[idx].is_eltwise(false); }; + auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(false); }; + + switch (p.len()) { + case 0: return true; // no post_ops + case 1: return is_eltwise(0) || is_sum(0); // sum OR eltwise + case 2: return is_sum(0) && is_eltwise(1); // sum -> eltwise + default: return false; + } + + return false; + } + + status_t set_default_params() { + return template_set_default_params( + src_md_, weights_md_, dst_md_, &bias_md_, ndims()); + } + }; + + const pd_t *pd() const override { + return (const pd_t *)primitive_t::pd().get(); + } +}; + +struct cudnn_gemm_inner_product_bwd_data_t + : public cudnn_inner_product_bwd_data_t { + using cudnn_inner_product_bwd_data_t::cudnn_inner_product_bwd_data_t; + using parent_pd_t = cudnn_inner_product_bwd_data_t::pd_t; + + struct pd_t : public parent_pd_t { + using parent_pd_t::parent_pd_t; + + DECLARE_COMMON_PD_T( + "cuda:cudnn:gemm", cudnn_gemm_inner_product_bwd_data_t); + + status_t init(engine_t *engine) { + using namespace prop_kind; + using namespace data_type; + assert(engine->kind() == engine_kind::gpu); + bool ok = true && this->desc()->prop_kind == backward_data + && set_default_params() == status::success; + if (!ok) return status::unimplemented; + if (has_zero_dim_memory()) return status::success; + bool gemm_compatible = gemm_consitency_check( + diff_src_md(), weights_md(), diff_dst_md()); + bool need_reorder = gemm_compatible + ? 
false + : reorder_check(diff_src_md(), weights_md(), diff_dst_md()); + + ok = ok && expect_data_types(f32, f32, data_type::undef, f32, f32) + && attr()->has_default_values() + && dense_check(diff_src_md(), weights_md(), diff_dst_md()) + && (gemm_compatible || need_reorder); + if (!ok) return status::unimplemented; + + inner_product_impl_.reset( + new cudnn_gemm_inner_product_bwd_data_impl_t()); + + return inner_product_impl_->init( + engine, this, false, false, false, need_reorder); + } + + status_t set_default_params() { + return template_set_default_params(diff_src_md_, weights_md_, + diff_dst_md_, &glob_zero_md, ndims()); + } + }; + + const pd_t *pd() const override { + return (const pd_t *)primitive_t::pd().get(); + } +}; + +struct cudnn_gemm_inner_product_bwd_weights_t + : public cudnn_inner_product_bwd_weights_t { + using cudnn_inner_product_bwd_weights_t::cudnn_inner_product_bwd_weights_t; + using parent_pd_t = cudnn_inner_product_bwd_weights_t::pd_t; + + struct pd_t : public parent_pd_t { + using parent_pd_t::parent_pd_t; + + DECLARE_COMMON_PD_T( + "cuda:cudnn:gemm", cudnn_gemm_inner_product_bwd_weights_t); + + status_t init(engine_t *engine) { + using namespace prop_kind; + using namespace data_type; + assert(engine->kind() == engine_kind::gpu); + bool ok = true && this->desc()->prop_kind == backward_weights + && set_default_params() == status::success; + if (!ok) return status::unimplemented; + if (has_zero_dim_memory()) return status::success; + bool gemm_compatible = gemm_consitency_check( + src_md(), diff_weights_md(), diff_dst_md()); + bool need_reorder = gemm_compatible + ? false + : reorder_check(src_md(), diff_weights_md(), diff_dst_md()); + + ok = ok && expect_data_types(f32, f32, f32, f32, f32) + && attr()->has_default_values() + && dense_check(src_md(), diff_weights_md(), diff_dst_md()) + && (gemm_compatible || need_reorder); + if (!ok) return status::unimplemented; + inner_product_impl_.reset( + new cudnn_gemm_inner_product_bwd_weights_impl_t()); + return inner_product_impl_->init( + engine, this, false, false, false, need_reorder); + } + + status_t set_default_params() { + return template_set_default_params(src_md_, diff_weights_md_, + diff_dst_md_, &diff_bias_md_, ndims()); + } + }; + + const pd_t *pd() const override { + return (const pd_t *)primitive_t::pd().get(); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_gemm_inner_product_impl.hpp b/src/gpu/nvidia/cudnn_gemm_inner_product_impl.hpp new file mode 100644 index 00000000000..f177b7bc585 --- /dev/null +++ b/src/gpu/nvidia/cudnn_gemm_inner_product_impl.hpp @@ -0,0 +1,463 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
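The GEMM-based implementation that follows lowers the inner product onto a single cublasGemmEx call using the usual row-major-as-column-major trick: with weights W (OC x IC) and source X (MB x IC) stored row-major, dst = X * W^T is evaluated as the column-major product dst^T = W * X^T with m = OC, n = MB, k = IC, which is exactly how m_, n_, k_ and the leading dimensions are set below. A standalone f32 sketch of that call (cuBLAS 10.x-style, where the compute type is still a cudaDataType_t; the code below also forces the compute type to f32 for f16 I/O because of the cuBLAS issue noted in its comment):

#include <cublas_v2.h>

// dst(MB x OC, row-major) = src(MB x IC, row-major) * weights(OC x IC)^T,
// expressed as the column-major product dst^T = W * src^T.
void ip_forward_gemm_f32(cublasHandle_t handle,
        const float *weights /* OC x IC */, const float *src /* MB x IC */,
        float *dst /* MB x OC */, int MB, int IC, int OC) {
    const float alpha = 1.0f, beta = 0.0f;
    const int m = OC, n = MB, k = IC;
    cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha,
            weights, CUDA_R_32F, /*lda=*/k, // OC x IC row-major, so transpose
            src, CUDA_R_32F, /*ldb=*/k,     // MB x IC row-major
            &beta, dst, CUDA_R_32F, /*ldc=*/m,
            CUDA_R_32F /* compute type */, CUBLAS_GEMM_DEFAULT);
}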
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_GEMM_INNER_PRODUCT_IMPL_HPP +#define GPU_NVIDIA_CUDNN_GEMM_INNER_PRODUCT_IMPL_HPP + +#include "cublas_v2.h" +#include "cudnn.h" + +#include "common/type_helpers.hpp" +#include "gpu/nvidia/cudnn_inner_product_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +// GEMM Implementation +struct cudnn_gemm_inner_product_base_t { +protected: + int m_, n_, k_, lda_, ldb_, ldc_; + cublasOperation_t trans_a_, trans_b_; + // compute_type is always equal to c_type_; + // if datatype is f16 or s8 and bias is presented the compute type must be + // f32 and we need to do the operation in f32 + cudaDataType_t a_type_, b_type_, c_type_, + // Despite the claim in cuBlas + // (https://docs.nvidia.com/cuda/cublas/index.html#cublas-GemmEx) + // for the support of fp16 computation when all the types are fp16, + // in cublas 10.1, and 10.2, if the fp16 is chosen as a + // computation mode, it silently does no computation. So we force + // computation type to be f32 in order to get the correct result. + // This can be reverted when the bug in cublas is fixed. + compute_type_ = CUDA_R_32F; + cublasGemmAlgo_t algo_ = CUBLAS_GEMM_DEFAULT; + status_t get_cublas_data_type( + const cudnnDataType_t &cudnn_dt, cudaDataType_t &blas_dt) const { + switch (cudnn_dt) { + case CUDNN_DATA_FLOAT: blas_dt = CUDA_R_32F; return status::success; + case CUDNN_DATA_HALF: blas_dt = CUDA_R_16F; return status::success; + case CUDNN_DATA_INT8: blas_dt = CUDA_R_8I; return status::success; + case CUDNN_DATA_INT8x4: blas_dt = CUDA_R_8I; return status::success; + default: return status::unimplemented; + } + return status::unimplemented; + } +}; + +struct cudnn_gemm_inner_product_fwd_impl_t + : public cudnn_inner_product_fwd_base_t, + public cudnn_gemm_inner_product_base_t, + public cudnn_conv_filter_adjustment_base_t { + + cudnnActivationDescriptor_t act_desc_; + bool use_acc_dst_; + cudnnTensorDescriptor_t y_acc_desc_; + bool need_reorder_; + + bool ip_using_scratchpad() const override { return (use_acc_dst_ > 0); } + virtual bool need_to_transform_filter() const { return need_reorder_; } + + virtual status_t init(engine_t *, inner_product_pd_t *pd, bool with_relu, + bool with_eltwise, bool with_sum, bool need_reorder) override { + need_reorder_ = need_reorder; + // GEMM is column major, here the data is row major. + // By switching the weight and source we convert the row major to + // column major without transposing matrices. + // B * A = C, where B is weight, A is src and C is dst + bool wie_tr = (pd->weights_md()->format_desc.blocking.strides[0] != 1); + CHECK(convert_data_type(pd->src_md(), &data_types_[io::src])); + CHECK(convert_data_type(pd->weights_md(0), &data_types_[io::wei])); + if (need_reorder) { + cudnnTensorFormat_t source_format; + CHECK(get_format(pd->src_md(), source_format)); + ndims_ = pd->ndims() < 4 ? 
4 : pd->ndims(); + get_4d_tensor_descriptor( + pd->weights_md(0), dims_[io::wei], strides_[io::wei]); + set_filter_format( + ndims_, dims_[io::wei], strides_[NUM_IO], source_format); + CHECK(init_filter_transformation(data_types_[io::wei], ndims_, + dims_[io::wei], strides_[io::wei], strides_[NUM_IO])); + + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_none, + memory_desc_wrapper(pd->weights_md(0)).size(), size_t(1)); + wie_tr = strides_[NUM_IO][0] != 1; + } + + trans_a_ = wie_tr ? CUBLAS_OP_T : CUBLAS_OP_N; + trans_b_ = CUBLAS_OP_N; + int ic = pd->IC_total_padded(); + int oc = pd->OC(); + int mb = pd->MB(); + n_ = mb; + k_ = ic; + m_ = oc; + lda_ = wie_tr ? k_ : m_; + ldb_ = k_; + ldc_ = m_; + with_bias_ = pd->with_bias(); + with_eltwise_ = with_eltwise || with_relu; + with_relu_ = with_eltwise; + use_acc_dst_ = ((pd->dst_md()->data_type == data_type::s8) + || (with_bias_ + && pd->weights_md(1)->data_type + != pd->dst_md()->data_type)); + // this must be applied on bias if exists. + output_scales_ = pd->attr()->output_scales_.scales_[0]; // alpha + with_sum_ = with_sum; + // scaling factor to add the previous destination value to the current + // computation. This is equivalent of + sum_scale_ = sum_scale(pd); + ndims_ = 4; + + bool input_is_blocked + = pd->src_md()->format_desc.blocking.inner_blks[0] == 4 + && pd->weights_md(0)->format_desc.blocking.inner_blks[0] == 4; + if (input_is_blocked) { // since we flatten the tensor and use gemm + // we dont care about the blocked data type + data_types_[io::src] = CUDNN_DATA_INT8; + data_types_[io::wei] = CUDNN_DATA_INT8; + data_types_[io::dst] = CUDNN_DATA_INT8; + } else { + CHECK(convert_data_type(pd->dst_md(), &data_types_[io::dst])); + } + CHECK(get_cublas_data_type(data_types_[io::wei], a_type_)); + CHECK(get_cublas_data_type(data_types_[io::src], b_type_)); + + c_type_ = (data_types_[io::dst] == CUDNN_DATA_HALF && !use_acc_dst_) + ? CUDA_R_16F + : CUDA_R_32F; + get_4d_tensor_descriptor( + pd->dst_md(), dims_[io::dst], strides_[io::dst]); + + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::dst], + data_types_[io::dst], ndims_, dims_[io::dst], + strides_[io::dst])); + + if (with_bias_) { + CHECK(convert_data_type(pd->weights_md(1), &data_types_[io::bia])); + // format is always nchw + set_bias_dims(CUDNN_TENSOR_NCHW, ndims_, pd->OC()); + + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::bia], + data_types_[io::bia], ndims_, dims_[io::bia], + strides_[io::bia])); + } + if (use_acc_dst_) { + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_iprod_int_dat_in_acc_dt, + memory_desc_wrapper(pd->dst_md()).size(), size_t(1)); + CHECK(create_and_set_tensor_descriptor(&y_acc_desc_, + CUDNN_DATA_FLOAT, ndims_, dims_[io::dst], + strides_[io::dst])); + } else { + y_acc_desc_ = tensor_descs_[io::dst]; + } + if (with_eltwise_) { CHECK(create_and_set_op_descriptor(pd)); } + return status::success; + } + + void execute(cudnnHandle_t cudnn_handle, cublasHandle_t cublas_handle, + const std::vector &args) const override { + assert(args.size() == 7); + auto x = args[0], w = args[1], b = args[2], y = args[3], + workspace = args[4]; + auto w_arg = w; + if (need_reorder_) { + void *transformed_w = args[5]; + transform_filter(cudnn_handle, w, transformed_w); + w_arg = transformed_w; + } + auto y_dst = use_acc_dst_ ? workspace : y; + auto sum_scale = use_acc_dst_ ? 
0.0f : sum_scale_; + // do gemm + CUBLAS_EXECUTE_FUNC(cublasGemmEx, cublas_handle, trans_a_, trans_b_, m_, + n_, k_, &output_scales_, w_arg, a_type_, lda_, x, b_type_, ldb_, + &sum_scale, y_dst, c_type_, ldc_, compute_type_, algo_); + + if (with_bias_) { + + CUDNN_EXECUTE_FUNC(cudnnAddTensor, cudnn_handle, &output_scales_, + tensor_descs_[io::bia], b, &alpha_, y_acc_desc_, y_dst); + } + if (use_acc_dst_) { + CUDNN_EXECUTE_FUNC(cudnnTransformTensor, cudnn_handle, &alpha_, + y_acc_desc_, y_dst, &sum_scale_, tensor_descs_[io::dst], y); + } + if (with_eltwise_) { + CUDNN_EXECUTE_FUNC(cudnnActivationForward, cudnn_handle, act_desc_, + &alpha_, tensor_descs_[io::dst], y, &beta_, + tensor_descs_[io::dst], y); + } + } + + status_t create_and_set_op_descriptor(const inner_product_pd_t *pd) { + + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnCreateActivationDescriptor, &act_desc_)); + + cudnnActivationMode_t act_mode; + switch (eltwise_algorithm_kind(pd)) { + case alg_kind::eltwise_tanh: + act_mode = CUDNN_ACTIVATION_TANH; + break; + case alg_kind::eltwise_elu: act_mode = CUDNN_ACTIVATION_ELU; break; + case alg_kind::eltwise_relu: + act_mode = CUDNN_ACTIVATION_RELU; + break; + case alg_kind::eltwise_logistic: + act_mode = CUDNN_ACTIVATION_SIGMOID; + break; + case alg_kind::eltwise_bounded_relu: + act_mode = CUDNN_ACTIVATION_CLIPPED_RELU; + break; + default: return status::unimplemented; + } + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, act_desc_, + act_mode, cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN, + eltwise_alpha(pd))); + + return status::success; + } +}; + +struct cudnn_gemm_inner_product_bwd_data_impl_t + : public cudnn_inner_product_impl_base_t, + public cudnn_gemm_inner_product_base_t, + public cudnn_conv_filter_adjustment_base_t { + bool need_reorder_; + + virtual bool need_to_transform_filter() const { return need_reorder_; } + + virtual status_t init(engine_t *, inner_product_pd_t *pd, + bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */, + bool need_reorder) override { + need_reorder_ = need_reorder; + + // GEMM is column major, here the data is row major. + // By switching the weight and source we convert the row major to + // column major without transposing matrices. + // B * A = C, where B is weight, A is d_dst and C is d_src + bool wie_tr = (pd->weights_md(0)->format_desc.blocking.strides[0] == 1); + CHECK(convert_data_type(pd->diff_src_md(), &data_types_[io::src])); + CHECK(convert_data_type(pd->weights_md(0), &data_types_[io::wei])); + CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[io::dst])); + if (need_reorder) { + cudnnTensorFormat_t diff_source_format_; + CHECK(get_format(pd->diff_src_md(), diff_source_format_)); + ndims_ = pd->ndims() < 4 ? 4 : pd->ndims(); + get_4d_tensor_descriptor( + pd->weights_md(0), dims_[io::wei], strides_[io::wei]); + set_filter_format(ndims_, dims_[io::wei], strides_[NUM_IO], + diff_source_format_); + CHECK(init_filter_transformation(data_types_[io::wei], ndims_, + dims_[io::wei], strides_[io::wei], strides_[NUM_IO])); + + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_none, + memory_desc_wrapper(pd->weights_md(0)).size(), size_t(1)); + wie_tr = strides_[NUM_IO][0] == 1; + } + trans_a_ = wie_tr ? CUBLAS_OP_T : CUBLAS_OP_N; + trans_b_ = CUBLAS_OP_N; + int ic = pd->IC_total_padded(); + int oc = pd->OC(); + int mb = pd->MB(); + n_ = mb; + k_ = oc; + m_ = ic; + lda_ = wie_tr ? 
k_ : m_; + ldb_ = k_; + ldc_ = m_; + CHECK(get_cublas_data_type(data_types_[io::wei], a_type_)); + CHECK(get_cublas_data_type(data_types_[io::dst], b_type_)); + CHECK(get_cublas_data_type(data_types_[io::src], c_type_)); + return status::success; + } + void execute(cudnnHandle_t cudnn_handle, cublasHandle_t cublas_handle, + const std::vector &args) const override { + assert(args.size() == 5); + auto dx = args[0], w = args[1], dy = args[2]; + auto w_arg = w; + if (need_reorder_) { + void *transformed_w = args[4]; + transform_filter(cudnn_handle, w, transformed_w); + w_arg = transformed_w; + } + // do gemm + CUBLAS_EXECUTE_FUNC(cublasGemmEx, cublas_handle, trans_a_, trans_b_, m_, + n_, k_, &alpha_, w_arg, a_type_, lda_, dy, b_type_, ldb_, + &beta_, dx, c_type_, ldc_, compute_type_, algo_); + } +}; + +struct cudnn_gemm_inner_product_bwd_weights_impl_t + : public cudnn_inner_product_impl_base_t, + public cudnn_gemm_inner_product_base_t, + public cudnn_conv_filter_adjustment_base_t { + cudnnReduceTensorDescriptor_t reduceTensorDesc_ = nullptr; + bool wie_tr_; + bool need_reorder_; + + virtual bool need_to_transform_filter() const { return need_reorder_; } + + virtual ~cudnn_gemm_inner_product_bwd_weights_impl_t() { + if (reduceTensorDesc_) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyReduceTensorDescriptor, reduceTensorDesc_); + } + } + status_t create_and_set_reduce_descriptor() { + CUDNN_EXECUTE_FUNC_S( + cudnnCreateReduceTensorDescriptor, &reduceTensorDesc_); + CUDNN_EXECUTE_FUNC_S(cudnnSetReduceTensorDescriptor, reduceTensorDesc_, + CUDNN_REDUCE_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES); + return status::success; + } + virtual status_t init(engine_t *engine, inner_product_pd_t *pd, + bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */, + bool need_reorder) override { + need_reorder_ = need_reorder; + with_bias_ = pd->with_bias(); + + // GEMM is column major, here the data is row major. + // By switching the weight and source we convert the row major to + // column major without transposing matrices. + // B * A = C. + // Here the backward weight is equivalent to d_dst * src^T when the weight + // filter is IC*OC. Therefore B is d_dst and A is transposed src, and C + // is d_wei. However, when the filter format is OC*IC, the backward + // weight is equivalent to src * d_dst^T. In this case, B is src, A is + // transposed d_dst and C is d_wei. + wie_tr_ = (pd->diff_weights_md(0)->format_desc.blocking.strides[0] + == 1); + CHECK(convert_data_type(pd->src_md(), &data_types_[io::src])); + CHECK(convert_data_type(pd->diff_weights_md(0), &data_types_[io::wei])); + CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[io::dst])); + if (need_reorder_) { + cudnnTensorFormat_t source_format; + CHECK(get_format(pd->src_md(), source_format)); + ndims_ = pd->ndims() < 4 ?
4 : pd->ndims(); + get_4d_tensor_descriptor( + pd->diff_weights_md(0), dims_[io::wei], strides_[io::wei]); + set_filter_format( + ndims_, dims_[io::wei], strides_[NUM_IO], source_format); + CHECK(init_filter_transformation(data_types_[io::wei], ndims_, + dims_[io::wei], strides_[NUM_IO], strides_[io::wei])); + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_none, + memory_desc_wrapper(pd->diff_weights_md(0)).size(), + size_t(1)); + wie_tr_ = (strides_[NUM_IO][0] == 1); + } + trans_a_ = CUBLAS_OP_N; + trans_b_ = CUBLAS_OP_T; + int ic = pd->IC_total_padded(); + int oc = pd->OC(); + int mb = pd->MB(); + n_ = wie_tr_ ? ic : oc; + k_ = mb; + m_ = wie_tr_ ? oc : ic; + lda_ = m_; + ldb_ = n_; + ldc_ = m_; + + CHECK(get_cublas_data_type( + data_types_[(wie_tr_ ? io::dst : io::src)], a_type_)); + CHECK(get_cublas_data_type( + data_types_[(wie_tr_ ? io::src : io::dst)], b_type_)); + CHECK(get_cublas_data_type(data_types_[io::wei], c_type_)); + if (with_bias_) { + ndims_ = 4; + get_4d_tensor_descriptor( + pd->diff_dst_md(), dims_[io::dst], strides_[io::dst]); + CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[io::dst])); + set_bias_dims(CUDNN_TENSOR_NCHW, ndims_, pd->OC()); + CHECK(convert_data_type( + pd->diff_weights_md(1), &data_types_[io::bia])); + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::dst], + data_types_[io::dst], ndims_, dims_[io::dst], + strides_[io::dst])); + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::bia], + data_types_[io::bia], ndims_, dims_[io::bia], + strides_[io::bia])); + CHECK(create_and_set_reduce_descriptor()); + + auto &sycl_engine = *utils::downcast(engine); + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + + auto cuda_stream + = utils::downcast(service_stream); + auto handle = cuda_stream->get_cudnn_handle(); + + // get the required workspace size + CUDNN_EXECUTE_FUNC_S(cudnnGetReductionWorkspaceSize, handle, + reduceTensorDesc_, tensor_descs_[io::dst], + tensor_descs_[io::bia], &workspace_size_); + } + + if (workspace_size_ > 0) { + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_iprod_int_dat_in_acc_dt, + workspace_size_, size_t(1)); + } + + return status::success; + } + void execute(cudnnHandle_t cudnn_handle, cublasHandle_t cublas_handle, + const std::vector &args) const override { + assert(args.size() == 6); + auto x = args[0], dy = args[1], dw = args[2], db = args[3], + workspace = args[4]; + auto dw_arg = need_reorder_ ? args[5] : dw; + // do gemm + CUBLAS_EXECUTE_FUNC(cublasGemmEx, cublas_handle, trans_a_, trans_b_, m_, + n_, k_, &alpha_, (wie_tr_ ? dy : x), a_type_, lda_, + (wie_tr_ ? x : dy), b_type_, ldb_, &beta_, dw_arg, c_type_, + ldc_, compute_type_, algo_); + + if (need_reorder_) { + // The output of weight is in nvida specific format, + // however a user requires the oneDNN format as an output + transform_filter(cudnn_handle, dw_arg, dw); + } + if (with_bias_) { + + // backward bias for inner product is reduction of dy on dim[0] . + // So we can use cudnnReduceTensor to partially reduce dy. 
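+            // The reduce descriptor was created with CUDNN_REDUCE_TENSOR_ADD, + // dy is described as an MB x OC x 1 x 1 tensor and db as a + // 1 x OC x 1 x 1 tensor, so this call sums dy over the mini-batch + // dimension, using the workspace sized at init() time via + // cudnnGetReductionWorkspaceSize.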
+ CUDNN_EXECUTE_FUNC(cudnnReduceTensor, cudnn_handle, + reduceTensorDesc_, nullptr, 0, workspace, workspace_size_, + &alpha_, tensor_descs_[io::dst], dy, &beta_, + tensor_descs_[io::bia], db); + } + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_inner_product.cpp b/src/gpu/nvidia/cudnn_inner_product.cpp new file mode 100644 index 00000000000..85d5f07f90e --- /dev/null +++ b/src/gpu/nvidia/cudnn_inner_product.cpp @@ -0,0 +1,238 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/nvidia/cudnn_inner_product.hpp" +#include "gpu/nvidia/cudnn_conv_inner_product.hpp" +#include "gpu/nvidia/cudnn_gemm_inner_product.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "sycl/sycl_buffer_memory_storage.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_inner_product_fwd_t::execute(const exec_ctx_t &ctx) const { + if (pd()->has_zero_dim_memory()) return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + using scratch_acc_t = cl::sycl::accessor; + using read_acc_t + = cl::sycl::accessor; + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto wei_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + std::shared_ptr bias_acc; + if (pd()->with_bias()) { + bias_acc = std::make_shared( + CTX_IN_ACCESSOR(DNNL_ARG_BIAS)); + } + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + std::shared_ptr ip_scratch_acc; + std::shared_ptr spacial_scratch_acc; + std::shared_ptr scaled_bias_scratch_acc; + if (pd()->inner_product_impl_->ip_using_scratchpad()) { + ip_scratch_acc = std::make_shared< + scratch_acc_t>(CTX_SCRATCH_ACCESSOR( + memory_tracking::names::key_iprod_int_dat_in_acc_dt)); + } + if (pd()->inner_product_impl_->need_to_transform_filter()) { + spacial_scratch_acc = std::make_shared( + CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none)); + } + if (pd()->inner_product_impl_->conv_using_scale_scratchpad()) { + scaled_bias_scratch_acc + = std::make_shared(CTX_SCRATCH_ACCESSOR( + memory_tracking::names::key_conv_adjusted_scales)); + } + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto cudnn_handle = cuda_stream->get_cudnn_handle(); + auto cublas_handle = cuda_stream->get_cublas_handle(); + + std::vector args; + + args.push_back(sc.memory(ih, src_acc)); + args.push_back(sc.memory(ih, wei_acc)); + args.push_back( + ((pd()->with_bias()) ? 
sc.memory(ih, *bias_acc) + : nullptr)); + args.push_back(sc.memory(ih, dst_acc)); + args.push_back((pd()->inner_product_impl_->ip_using_scratchpad() + ? sc.memory(ih, *ip_scratch_acc) + : nullptr)); + args.push_back(( + pd()->inner_product_impl_->need_to_transform_filter() + ? sc.memory(ih, *spacial_scratch_acc) + : nullptr)); + args.push_back(( + pd()->inner_product_impl_->conv_using_scale_scratchpad() + ? sc.memory(ih, *scaled_bias_scratch_acc) + : nullptr)); + pd()->inner_product_impl_->execute( + cudnn_handle, cublas_handle, args); + }); + }); +} + +status_t cudnn_inner_product_bwd_data_t::execute(const exec_ctx_t &ctx) const { + if (pd()->has_zero_dim_memory()) return status::success; + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + using scratch_acc_t = cl::sycl::accessor; + auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + auto wei_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC); + std::shared_ptr ip_scratch_acc; + std::shared_ptr spacial_scratch_acc; + if (pd()->inner_product_impl_->ip_using_scratchpad()) { + ip_scratch_acc = std::make_shared< + scratch_acc_t>(CTX_SCRATCH_ACCESSOR( + memory_tracking::names::key_iprod_int_dat_in_acc_dt)); + } + if (pd()->inner_product_impl_->need_to_transform_filter()) { + spacial_scratch_acc = std::make_shared( + CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none)); + } + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto cudnn_handle = cuda_stream->get_cudnn_handle(); + auto cublas_handle = cuda_stream->get_cublas_handle(); + + std::vector args; + + args.push_back(sc.memory(ih, diff_src_acc)); + args.push_back(sc.memory(ih, wei_acc)); + args.push_back(sc.memory(ih, diff_dst_acc)); + args.push_back((pd()->inner_product_impl_->ip_using_scratchpad() + ? sc.memory(ih, *ip_scratch_acc) + : nullptr)); + args.push_back(( + pd()->inner_product_impl_->need_to_transform_filter() + ? sc.memory(ih, *spacial_scratch_acc) + : nullptr)); + pd()->inner_product_impl_->execute( + cudnn_handle, cublas_handle, args); + }); + }); +} + +status_t cudnn_inner_product_bwd_weights_t::execute( + const exec_ctx_t &ctx) const { + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + if (pd()->has_zero_dim_memory()) { + auto wei_sz = memory_desc_wrapper(pd()->diff_weights_md(0)).size(); + size_t bias_sz = (pd()->with_bias() + ? 
memory_desc_wrapper(pd()->diff_weights_md(1)).size() + : 0); + + if (wei_sz != 0) { + auto status + = cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto diff_wei_acc + = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS); + cgh.fill(diff_wei_acc, static_cast(0)); + }); + if (status != status::success) return status; + } + if (bias_sz != 0) { + auto status + = cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto diff_bia_acc + = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS); + cgh.fill(diff_bia_acc, static_cast(0)); + }); + if (status != status::success) return status; + } + return status::success; + } + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + using scratch_acc_t = cl::sycl::accessor; + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + auto diff_wei_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS); + using write_acc_t + = cl::sycl::accessor; + std::shared_ptr diff_bias_acc; + if (pd()->with_bias()) { + diff_bias_acc = std::make_shared( + CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS)); + } + std::shared_ptr ip_scratch_acc; + std::shared_ptr spacial_scratch_acc; + if (pd()->inner_product_impl_->ip_using_scratchpad()) { + ip_scratch_acc = std::make_shared< + scratch_acc_t>(CTX_SCRATCH_ACCESSOR( + memory_tracking::names::key_iprod_int_dat_in_acc_dt)); + } + if (pd()->inner_product_impl_->need_to_transform_filter()) { + spacial_scratch_acc = std::make_shared( + CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none)); + } + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto cudnn_handle = cuda_stream->get_cudnn_handle(); + auto cublas_handle = cuda_stream->get_cublas_handle(); + std::vector args; + + args.push_back(sc.memory(ih, src_acc)); + args.push_back(sc.memory(ih, diff_dst_acc)); + args.push_back(sc.memory(ih, diff_wei_acc)); + args.push_back( + ((pd()->with_bias()) ? sc.memory(ih, *diff_bias_acc) + : nullptr)); + + args.push_back((pd()->inner_product_impl_->ip_using_scratchpad() + ? sc.memory(ih, *ip_scratch_acc) + : nullptr)); + args.push_back(( + pd()->inner_product_impl_->need_to_transform_filter() + ? sc.memory(ih, *spacial_scratch_acc) + : nullptr)); + pd()->inner_product_impl_->execute( + cudnn_handle, cublas_handle, args); + }); + }); +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_inner_product.hpp b/src/gpu/nvidia/cudnn_inner_product.hpp new file mode 100644 index 00000000000..e95d050e003 --- /dev/null +++ b/src/gpu/nvidia/cudnn_inner_product.hpp @@ -0,0 +1,90 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_INNER_PRODUCT_HPP +#define GPU_NVIDIA_CUDNN_INNER_PRODUCT_HPP + +#include "cudnn.h" + +#include + +#include "common/c_types_map.hpp" +#include "common/inner_product_pd.hpp" +#include "common/primitive.hpp" +#include "gpu/nvidia/cudnn_inner_product_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_inner_product_fwd_t : public primitive_t { +public: + using primitive_t::primitive_t; + + struct pd_t : public inner_product_fwd_pd_t { + using inner_product_fwd_pd_t::inner_product_fwd_pd_t; + + std::shared_ptr inner_product_impl_; + }; + + status_t execute(const exec_ctx_t &ctx) const override; + virtual const pd_t *pd() const { + return (const pd_t *)primitive_t::pd().get(); + } +}; + +struct cudnn_inner_product_bwd_data_t : public primitive_t { +public: + using primitive_t::primitive_t; + + struct pd_t : public inner_product_bwd_data_pd_t { + using inner_product_bwd_data_pd_t::inner_product_bwd_data_pd_t; + + std::shared_ptr inner_product_impl_; + }; + + status_t execute(const exec_ctx_t &ctx) const override; + virtual const pd_t *pd() const { + return (const pd_t *)primitive_t::pd().get(); + } +}; + +struct cudnn_inner_product_bwd_weights_t : public primitive_t { +public: + using primitive_t::primitive_t; + struct pd_t : public inner_product_bwd_weights_pd_t { + using inner_product_bwd_weights_pd_t::inner_product_bwd_weights_pd_t; + + std::shared_ptr inner_product_impl_; + }; + + status_t execute(const exec_ctx_t &ctx) const override; + + virtual const pd_t *pd() const { + return (const pd_t *)primitive_t::pd().get(); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_inner_product_impl.hpp b/src/gpu/nvidia/cudnn_inner_product_impl.hpp new file mode 100644 index 00000000000..d53de509a29 --- /dev/null +++ b/src/gpu/nvidia/cudnn_inner_product_impl.hpp @@ -0,0 +1,191 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_INNER_PRODUCT_IMPL_HPP +#define GPU_NVIDIA_CUDNN_INNER_PRODUCT_IMPL_HPP + +#include "cublas_v2.h" +#include "cudnn.h" + +#include "common/type_helpers.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { +namespace { +inline void get_4d_tensor_descriptor( + const memory_desc_t *mem_desc1, int *dims, int *strides) { + memory_desc_t mem_desc = *mem_desc1; + + // Forcing tensors dims less than 4 to be 4 {n c h w}; + using namespace format_tag; + auto set_dim = [&]() { + if (mem_desc.ndims == 3) { + mem_desc.ndims = 4; + mem_desc.dims[3] = mem_desc.dims[2]; + mem_desc.dims[2] = 1; + mem_desc.padded_dims[3] = mem_desc.padded_dims[2]; + mem_desc.padded_dims[2] = 1; + } else if (mem_desc.ndims == 2) { + mem_desc.ndims = 4; + mem_desc.dims[3] = 1; + mem_desc.dims[2] = 1; + mem_desc.padded_dims[3] = 1; + mem_desc.padded_dims[2] = 1; + } + }; + auto &stride = mem_desc.format_desc.blocking.strides; + auto &dim = mem_desc.dims; + // Forcing strides < 4 to be 4 + if (memory_desc_matches_tag(mem_desc, nwc)) { + set_dim(); + // promoting nwc(owi) to NHWC = {wc 1 c} to {wc 1 wc c} + mem_desc.format_desc.blocking.strides[3] + = mem_desc.format_desc.blocking.strides[2]; + mem_desc.format_desc.blocking.strides[2] + = mem_desc.format_desc.blocking.strides[0]; + assert(memory_desc_matches_tag(mem_desc, nhwc) + && "Tag is not set to NHWC"); + } else if (memory_desc_matches_tag(mem_desc, ncw)) { + set_dim(); + // promoting ncw(oiw) to NCHW = {wc w 1} to {wc w w 1} + mem_desc.format_desc.blocking.strides[3] + = mem_desc.format_desc.blocking.strides[2]; + mem_desc.format_desc.blocking.strides[2] + = mem_desc.format_desc.blocking.strides[1]; + assert(memory_desc_matches_tag(mem_desc, nchw) + && "Tag is not set to NCHW"); + } else if (memory_desc_matches_tag(mem_desc, wio)) { + set_dim(); + // promoting wcn(wio) to HWCN = {1 n nc} to {1 n ncw nc} + mem_desc.format_desc.blocking.strides[3] + = mem_desc.format_desc.blocking.strides[2]; + mem_desc.format_desc.blocking.strides[2] *= mem_desc.dims[3]; + assert(memory_desc_matches_tag(mem_desc, hwio) + && " Tag is not set to HWIO"); + } else if (memory_desc_matches_tag(mem_desc, nc)) { + set_dim(); + // fixing strides + // promoting nc(oi) to NCHW = {c 1} to {c 1 1 1} + mem_desc.format_desc.blocking.strides[2] + = mem_desc.format_desc.blocking.strides[1]; + mem_desc.format_desc.blocking.strides[3] + = mem_desc.format_desc.blocking.strides[1]; + assert(memory_desc_matches_tag(mem_desc, nchw) + && " Tag is not set to NCHW"); + } else if (memory_desc_matches_tag(mem_desc, cn)) { + set_dim(); + // fixing strides cn(oi) to HWCN = {1 n} to {1 n nc nc}. + // Note that CHWN exists as well, but for inner product + // we convert it to HWCN. Other primitives may need + // different conversion. 
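+    // For example, a 2-D cn (oi) tensor with dims {n, c} and strides {1, n} + // is viewed as the 4-D tensor {n, c, 1, 1} with strides {1, n, n*c, n*c}, + // where the stride products below are computed from the padded dims.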
+ mem_desc.format_desc.blocking.strides[2] + = mem_desc.format_desc.blocking.strides[1] + * mem_desc.padded_dims[1]; + mem_desc.format_desc.blocking.strides[3] + = mem_desc.format_desc.blocking.strides[2]; + assert(memory_desc_matches_tag(mem_desc, hwio) + && " Tag is not set to NCHW"); + } + convert_dnnl_dims_array(mem_desc.dims, dims, mem_desc.ndims); + convert_dnnl_dims_array( + mem_desc.format_desc.blocking.strides, strides, mem_desc.ndims); +} +} // namespace +struct cudnn_inner_product_impl_base_t { + // The io enum requires the weights be the last parameter to ensure + // tensor_descs is contiguous. + enum io { src = 0, bia, dst, wei, NUM_IO }; + cudnnDataType_t data_types_[NUM_IO + 1]; // +1 data-type for accumulation + int ndims_; + int dims_[NUM_IO][DNNL_MAX_NDIMS]; + // one extra stride added for transform filter + int strides_[NUM_IO + 1][DNNL_MAX_NDIMS]; + + cudnnTensorDescriptor_t tensor_descs_[NUM_IO - 1] = {}; + + size_t workspace_size_ = 0; + float alpha_ = 1, beta_ = 0; + bool with_bias_; + bool scale_bias_ = false; + bool with_relu_ = false, with_eltwise_ = false, with_sum_ = false; + bool filter_using_spatial_format_ = false; + + virtual bool need_to_transform_filter() const { + return filter_using_spatial_format_; + } + + virtual bool ip_using_scratchpad() const { return (workspace_size_ > 0); } + bool conv_using_scale_scratchpad() const { return scale_bias_; } + + void set_bias_dims(cudnnTensorFormat_t format, int ndims, int bias_dim) { + // Set the dimensions and strides for the bias. + // Note that the second dimension of bias and the first dimension + // of filter should be equal, as cuDNN always stores dimensions in + // NCDHW order. The first dimension of filter must be equal to the + // second dimension of bias + for (size_t i = 0; i < ndims; ++i) { + dims_[io::bia][i] = 1; + strides_[io::bia][i] = (format != CUDNN_TENSOR_NHWC ? 1 : bias_dim); + } + dims_[io::bia][1] = bias_dim; + strides_[io::bia][1] = 1; + strides_[io::bia][0] = bias_dim; + } + virtual status_t init(engine_t * /*engine*/, inner_product_pd_t * /*pd*/, + bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */, + bool /*using_fused_path_for_blocking*/) + = 0; + + virtual void execute(cudnnHandle_t /*handle*/, + cublasHandle_t /*cublas_handle*/, + const std::vector & /*args*/) const = 0; +}; + +struct cudnn_inner_product_fwd_base_t : public cudnn_inner_product_impl_base_t { + float output_scales_; // alpha in gemm + float sum_scale_; // beta in gemm + float eltwise_alpha(const inner_product_pd_t *pd) const { + const int eltwise_idx + = pd->attr()->post_ops_.find(primitive_kind::eltwise); + return with_eltwise_ + ? pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alpha + : 0.0f; + } + float sum_scale(const inner_product_pd_t *pd) const { + const int sum_idx = pd->attr()->post_ops_.find(primitive_kind::sum); + return with_sum_ ? 
pd->attr()->post_ops_.entry_[sum_idx].sum.scale + : 0.0f; + } + + dnnl::impl::alg_kind_t eltwise_algorithm_kind( + const inner_product_pd_t *pd) const { + const int eltwise_idx + = pd->attr()->post_ops_.find(primitive_kind::eltwise); + return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alg; + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_lrn.cpp b/src/gpu/nvidia/cudnn_lrn.cpp new file mode 100644 index 00000000000..568bbc19e0d --- /dev/null +++ b/src/gpu/nvidia/cudnn_lrn.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/nvidia/cudnn_lrn.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "sycl/sycl_buffer_memory_storage.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_lrn_fwd_t::execute(const exec_ctx_t &ctx) const { + + if (memory_desc_wrapper(pd()->desc()->data_desc).has_zero_dim()) + return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto wrksp_acc = pd()->is_training() + ? 
CTX_OUT_ACCESSOR(DNNL_ARG_WORKSPACE) + : dst_acc; + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + std::vector args {sc.memory(ih, src_acc), + sc.memory(ih, dst_acc), + sc.memory(ih, wrksp_acc)}; + pd()->lrn_impl_->execute(handle, args); + }); + }); +} + +status_t cudnn_lrn_bwd_t::execute(const exec_ctx_t &ctx) const { + if (memory_desc_wrapper(pd()->desc()->data_desc).has_zero_dim()) + return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC); + auto ws_acc = CTX_IN_ACCESSOR(DNNL_ARG_WORKSPACE); + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) mutable { + std::vector args; + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + args.push_back(sc.memory(ih, src_acc)); + args.push_back(sc.memory(ih, ws_acc)); + args.push_back(sc.memory(ih, diff_src_acc)); + args.push_back(sc.memory(ih, diff_dst_acc)); + + pd()->lrn_impl_->execute(handle, args); + }); + }); +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_lrn.hpp b/src/gpu/nvidia/cudnn_lrn.hpp new file mode 100644 index 00000000000..bd2d40c3c1c --- /dev/null +++ b/src/gpu/nvidia/cudnn_lrn.hpp @@ -0,0 +1,132 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_LRN_HPP +#define GPU_NVIDIA_CUDNN_LRN_HPP + +#include "cudnn.h" + +#include + +#include "common/c_types_map.hpp" +#include "common/lrn_pd.hpp" +#include "common/primitive.hpp" +#include "gpu/nvidia/cudnn_lrn_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_lrn_fwd_t : public primitive_t { + + struct pd_t : public lrn_fwd_pd_t { + using lrn_fwd_pd_t::lrn_fwd_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_lrn_fwd_t); + + status_t init(engine_t *) { + using namespace data_type; + + bool ok = true && is_fwd() + && utils::one_of(desc()->prop_kind, + prop_kind::forward_inference, + prop_kind::forward_training) + && utils::one_of( + desc()->alg_kind, alg_kind::lrn_across_channels) + && utils::one_of(desc()->data_desc.data_type, f32, f16) + && attr()->has_default_values() + // Make sure local size is not even (issue #75) + && desc_.local_size % 2 + // lrn does not support blocking + && src_md()->format_desc.blocking.inner_nblks == 0; + if (!ok) return status::unimplemented; + + if (has_zero_dim_memory()) return status::success; + + if (is_training()) { ws_md_ = *dst_md(); } + + lrn_impl_.reset(new cudnn_lrn_fwd_impl_t()); + + return lrn_impl_->init(this); + } + + bool is_training() const { + return desc_.prop_kind == prop_kind::forward_training; + } + + std::shared_ptr lrn_impl_; + }; + + cudnn_lrn_fwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +struct cudnn_lrn_bwd_t : public primitive_t { + + struct pd_t : public lrn_bwd_pd_t { + using lrn_bwd_pd_t::lrn_bwd_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_lrn_bwd_t); + + status_t init(engine_t *) { + bool ok = true && !is_fwd() + && utils::one_of( + desc()->alg_kind, alg_kind::lrn_across_channels) + && utils::one_of(desc()->data_desc.data_type, + data_type::f16, data_type::f32) + && set_default_formats_common() + && attr()->has_default_values() + && desc_.local_size + % 2 // Make sure local size is not even (issue #75) + // lrn does not support blocking + && src_md()->format_desc.blocking.inner_nblks == 0 + && diff_dst_md()->format_desc.blocking.inner_nblks == 0; + if (!ok) return status::unimplemented; + if (has_zero_dim_memory()) { return status::success; }; + + ws_md_ = *diff_dst_md(); + if (!compare_ws(hint_fwd_pd_)) return status::unimplemented; + + lrn_impl_.reset(new cudnn_lrn_bwd_impl_t()); + + return lrn_impl_->init(this); + } + + std::shared_ptr lrn_impl_; + }; + + cudnn_lrn_bwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_lrn_impl.hpp b/src/gpu/nvidia/cudnn_lrn_impl.hpp new file mode 100644 index 00000000000..364e9d3eaa5 --- /dev/null +++ b/src/gpu/nvidia/cudnn_lrn_impl.hpp @@ -0,0 +1,201 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in 
compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_LRN_IMPL_HPP +#define GPU_NVIDIA_CUDNN_LRN_IMPL_HPP + +#include "cudnn.h" + +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_lrn_impl_base_t { + + virtual ~cudnn_lrn_impl_base_t() { + if (lrn_desc) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyLRNDescriptor, lrn_desc); + } + for (size_t i = 0; i < NUM_IO; i++) { + if (tensor_descs[i]) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorDescriptor, tensor_descs[i]); + } + } + } + virtual status_t init(const lrn_pd_t *pd) = 0; + virtual void execute( + cudnnHandle_t handle, const std::vector &args) const = 0; + +protected: + enum io { src_idx = 0, dst_idx, d_src_idx, d_dst_idx, NUM_IO }; + cudnnDataType_t data_types[NUM_IO]; + int ndims; + int dst_size; + int dims[NUM_IO][DNNL_MAX_NDIMS]; + int strides[NUM_IO][DNNL_MAX_NDIMS]; + float alpha = 1.0f; + float beta = 0.0f; + bool is_training; + double lrn_alpha; + double lrn_beta; + double lrn_K; + unsigned int lrn_N; + cudnnLRNMode_t lrn_mode; + cudnnLRNDescriptor_t lrn_desc = nullptr; + cudnnTensorDescriptor_t tensor_descs[NUM_IO] = {}; + + virtual status_t init_common(const lrn_pd_t *pd) { + ndims = std::max(4, pd->ndims()); + if (ndims > 6) { return status::invalid_arguments; } + + const bool do_scaling + = pd->src_md()->data_type == dnnl_data_type_t::dnnl_s8; + const auto scales_0 = pd->attr()->scales_.get(1).scales_; + const auto lrn_desc = pd->desc(); + const auto dst_wrap = memory_desc_wrapper(pd->dst_md()); + + dst_size = dst_wrap.nelems(); + alpha = do_scaling ? 
scales_0[0] : 1.0f; + is_training = pd->desc()->prop_kind == prop_kind::forward_training; + + lrn_K = lrn_desc->lrn_k; + lrn_N = lrn_desc->local_size; + lrn_alpha = lrn_desc->lrn_alpha; + lrn_beta = lrn_desc->lrn_beta; + + // Initialise lrn algorithm + CHECK(convert_alg_kind(pd->desc()->alg_kind, &lrn_mode)); + + // Set strides and dimensions + convert_dims(pd->src_md()->padded_dims, dims[src_idx], pd->ndims()); + convert_dims(pd->src_md()->format_desc.blocking.strides, + strides[src_idx], pd->ndims()); + + // Set datatype + CHECK(convert_data_type(pd->src_md(), &data_types[src_idx])); + + // Initialise tensor descriptor + CHECK(create_and_set_tensor_descriptor(&tensor_descs[src_idx], + data_types[src_idx], ndims, dims[src_idx], strides[src_idx])); + CHECK(create_and_set_lrn_descriptor()); + return status::success; + } + + virtual status_t create_and_set_lrn_descriptor() { + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateLRNDescriptor, &lrn_desc)); + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetLRNDescriptor, lrn_desc, lrn_N, + lrn_alpha, lrn_beta, lrn_K)); + return status::success; + } + + status_t convert_alg_kind( + alg_kind_t alg_kind, cudnnLRNMode_t *cuda_alg_kind) { + if (alg_kind == alg_kind::lrn_across_channels) { + *cuda_alg_kind = cudnnLRNMode_t::CUDNN_LRN_CROSS_CHANNEL_DIM1; + } else { + return status::unimplemented; + } + return status::success; + } +}; + +struct cudnn_lrn_fwd_impl_t : public cudnn_lrn_impl_base_t { + + status_t init(const lrn_pd_t *pd) override { + CHECK(init_common(pd)); + + convert_dims(pd->dst_md()->padded_dims, dims[dst_idx], pd->ndims()); + convert_dims(pd->dst_md()->format_desc.blocking.strides, + strides[dst_idx], pd->ndims()); + + CHECK(convert_data_type(pd->dst_md(), &data_types[dst_idx])); + CHECK(create_and_set_tensor_descriptor(&tensor_descs[dst_idx], + data_types[dst_idx], ndims, dims[dst_idx], strides[dst_idx])); + return status::success; + } + + void execute(cudnnHandle_t handle, + const std::vector &args) const override { + CUDNN_EXECUTE_FUNC(cudnnLRNCrossChannelForward, handle, lrn_desc, + lrn_mode, &alpha, tensor_descs[src_idx], args[0], &beta, + tensor_descs[dst_idx], args[1]); + if (is_training) { + float alpha = 1.0f; + float beta = 0.0f; + cudnnAddTensor(handle, &alpha, tensor_descs[dst_idx], args[dst_idx], + &beta, tensor_descs[2], args[2]); + } + } +}; +struct cudnn_lrn_bwd_impl_t : public cudnn_lrn_impl_base_t { + + status_t init(const lrn_pd_t *pd) override { + CHECK(init_common(pd)); + + // Set dimensions + convert_dims( + pd->diff_dst_md()->padded_dims, dims[dst_idx], pd->ndims()); + convert_dims( + pd->diff_src_md()->padded_dims, dims[d_src_idx], pd->ndims()); + convert_dims( + pd->diff_dst_md()->padded_dims, dims[d_dst_idx], pd->ndims()); + + // Set strides + convert_dims(pd->diff_dst_md()->format_desc.blocking.strides, + strides[dst_idx], pd->ndims()); + convert_dims(pd->diff_src_md()->format_desc.blocking.strides, + strides[d_src_idx], pd->ndims()); + convert_dims(pd->diff_dst_md()->format_desc.blocking.strides, + strides[d_dst_idx], pd->ndims()); + + // Set datatypes + CHECK(convert_data_type(pd->diff_dst_md(), &data_types[dst_idx])); + CHECK(convert_data_type(pd->diff_src_md(), &data_types[d_src_idx])); + CHECK(convert_data_type(pd->diff_dst_md(), &data_types[d_dst_idx])); + + // Initialise tensor descriptors + CHECK(create_and_set_tensor_descriptor(&tensor_descs[dst_idx], + data_types[dst_idx], ndims, dims[dst_idx], strides[dst_idx])); + CHECK(create_and_set_tensor_descriptor(&tensor_descs[d_src_idx], + data_types[d_src_idx], ndims, 
dims[d_src_idx], + strides[d_src_idx])); + CHECK(create_and_set_tensor_descriptor(&tensor_descs[d_dst_idx], + data_types[d_dst_idx], ndims, dims[d_dst_idx], + strides[d_dst_idx])); + return status::success; + } + + void execute(cudnnHandle_t handle, + const std::vector &args) const override { + + CUDNN_EXECUTE_FUNC_V(cudnnLRNCrossChannelBackward, handle, lrn_desc, + lrn_mode, &alpha, tensor_descs[dst_idx], args[dst_idx], + tensor_descs[d_dst_idx], args[d_dst_idx], tensor_descs[src_idx], + args[src_idx], &beta, tensor_descs[d_src_idx], args[d_src_idx]); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_matmul.cpp b/src/gpu/nvidia/cudnn_matmul.cpp new file mode 100644 index 00000000000..26846bb6eba --- /dev/null +++ b/src/gpu/nvidia/cudnn_matmul.cpp @@ -0,0 +1,87 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/nvidia/cudnn_matmul.hpp" + +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/type_helpers.hpp" + +#include "gpu/nvidia/cudnn_matmul_executor.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_matmul_t::execute(const exec_ctx_t &ctx) const { + const bool with_bias = matmul_impl_->with_bias(); + const bool has_runtime_args = matmul_impl_->has_runtime_params(); + + const auto src_d = ctx.memory_mdw(DNNL_ARG_SRC, pd()->src_md()); + const auto weights_d = ctx.memory_mdw(DNNL_ARG_WEIGHTS, pd()->weights_md()); + const auto dst_d = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md()); + const auto bias_d = with_bias + ? ctx.memory_mdw(DNNL_ARG_BIAS, pd()->weights_md(1)) + : nullptr; + + status_t status; + if (has_runtime_args) { + // Initialise all runtime parameters + status = matmul_impl_->init_parameters(src_d, weights_d, dst_d, bias_d); + if (status != status::success) return status; + } + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + if (!pd()->attr()->output_scales_.defined()) { + auto &buff = utils::downcast( + &CTX_IN_STORAGE(DNNL_ARG_ATTR_OUTPUT_SCALES)) + ->buffer(); + auto ev = copy(cuda_stream->queue(), buff, + reinterpret_cast(output_scale_)); + ev.wait(); + } + + const auto scratchpad_type = matmul_impl_->get_scratchpad_type(); + const auto scratchpad_size = matmul_impl_->with_scratchpad() + ? 
(dst_d.nelems() * types::data_type_size(scratchpad_type)) + : 0; + + status = executor_->execute(ctx, ctx.stream()->engine(), matmul_impl_, + *output_scale_, scratchpad_size); + + if (has_runtime_args) { + auto &evts = cuda_stream->get_deps(); + for (auto e : evts) { + e.wait(); + } + + matmul_impl_->cleanup(); + } + + return status; +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_matmul.hpp b/src/gpu/nvidia/cudnn_matmul.hpp new file mode 100644 index 00000000000..2d033d711b5 --- /dev/null +++ b/src/gpu/nvidia/cudnn_matmul.hpp @@ -0,0 +1,151 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_MATMUL_HPP +#define GPU_NVIDIA_CUDNN_MATMUL_HPP + +#include + +#include "common/matmul_pd.hpp" +#include "common/primitive.hpp" + +#include "gpu/nvidia/cudnn_matmul_executor.hpp" +#include "gpu/nvidia/cudnn_matmul_impl.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_matmul_t : public primitive_t { + struct pd_t : public matmul_pd_t { + using matmul_pd_t::matmul_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_matmul_t); + + status_t init(engine_t *) { + using namespace data_type; + using smask_t = primitive_attr_t::skip_mask_t; + + data_type_t src_dt = src_md()->data_type; + data_type_t dst_dt = dst_md()->data_type; + data_type_t wei_dt = weights_md(0)->data_type; + data_type_t bia_dt + = with_bias() ? 
weights_md(1)->data_type : data_type::f32; + + bool f32_case = utils::everyone_is(f32, src_dt, wei_dt, dst_dt); + bool f16_case = utils::everyone_is(f16, src_dt, wei_dt, dst_dt); + bool s8_case = utils::everyone_is(s8, src_dt, wei_dt) + && utils::one_of(dst_dt, s8, f32); + + bool ok = attr()->has_default_values( + smask_t::oscale_runtime | smask_t::post_ops) + && attr_oscale_ok() && attr_post_ops_ok() + && set_default_formats() + && (f32_case || f16_case || s8_case) + && IMPLICATION(with_bias(), + (IMPLICATION(f32_case, utils::one_of(bia_dt, f32)) + && IMPLICATION(f16_case, + utils::one_of(bia_dt, f16, f32)) + && IMPLICATION(s8_case, + utils::one_of(bia_dt, s8, f32)))); + + if (!ok) return status::unimplemented; + return status::success; + } + + private: + bool attr_oscale_ok() const { + const auto &oscale = attr()->output_scales_; + return oscale.mask_ == 0 || oscale.mask_ == (1 << (batched() + 1)); + } + + bool attr_post_ops_ok() const { + using namespace primitive_kind; + const auto &p = attr()->post_ops_; + switch (p.len()) { + case 0: return true; + case 1: return p.contain(sum, 0) || p.contain(eltwise, 0); + case 2: return p.contain(sum, 0) && p.contain(eltwise, 1); + default: return false; + } + } + }; + + cudnn_matmul_t(const pd_t *apd) : primitive_t(apd) {} + + status_t init(engine_t *engine) override { + matmul_impl_.reset(new cudnn_matmul_impl_t()); + const auto status + = matmul_impl_->init((matmul_pd_t *)primitive_t::pd().get()); + + if (pd()->attr()->output_scales_.defined()) { + output_scale_ = pd()->attr()->output_scales_.scales_; + } else { + // Only single-element scale is supported + output_scale_ = new float; + } + + const bool with_bias = matmul_impl_->with_bias(); + const bool has_runtime_args = matmul_impl_->has_runtime_params(); + const bool with_scratchpad = matmul_impl_->with_scratchpad(); + + if (with_scratchpad && has_runtime_args && with_bias) { + executor_.reset(new cudnn_matmul_scratch_runtime_args_bias_exec_t); + } else if (with_scratchpad && has_runtime_args) { + executor_.reset(new cudnn_matmul_runtime_args_scratch_exec_t); + } else if (has_runtime_args && with_bias) { + executor_.reset(new cudnn_matmul_runtime_args_bias_exec_t); + } else if (has_runtime_args) { + executor_.reset(new cudnn_matmul_runtime_args_exec_t); + } else if (with_bias && with_scratchpad) { + executor_.reset(new cudnn_matmul_bias_scratch_exec_t); + } else if (with_scratchpad) { + executor_.reset(new cudnn_matmul_scratch_exec_t); + } else if (with_bias) { + executor_.reset(new cudnn_matmul_bias_exec_t); + } else if (!with_scratchpad && !has_runtime_args && !with_bias) { + executor_.reset(new cudnn_matmul_exec_t); + } else { + return status::unimplemented; + } + + return status; + } + + status_t execute(const exec_ctx_t &ctx) const override; + + virtual ~cudnn_matmul_t() { + if (!pd()->attr()->output_scales_.defined()) { delete output_scale_; } + } + + std::shared_ptr matmul_impl_; + std::shared_ptr executor_; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + + float *output_scale_; +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_matmul_executor.hpp b/src/gpu/nvidia/cudnn_matmul_executor.hpp new file mode 100644 index 00000000000..fe403c42f87 --- /dev/null +++ b/src/gpu/nvidia/cudnn_matmul_executor.hpp @@ -0,0 +1,300 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay 
Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_MATMUL_EXECUTOR_HPP +#define GPU_NVIDIA_CUDNN_MATMUL_EXECUTOR_HPP + +#include "gpu/nvidia/cudnn_matmul.hpp" +#include "gpu/nvidia/cudnn_matmul_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" + +#include + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_matmul_exec_base_t { + virtual status_t execute(const exec_ctx_t &ctx, engine_t *engine, + const std::shared_ptr matmul_impl_, + float output_scale, std::size_t scratchpad_size) + = 0; + +protected: + template + void interop_task(std::shared_ptr matmul_impl_, + engine_t *engine, cl::sycl::handler &cgh, + nvidia::sycl_cuda_stream_t *cuda_stream, read_acc_t weights_acc, + read_acc_t src_acc, write_acc_t dst_acc, bias_acc_t bias_acc, + scratch_acc_t scratch_acc, float output_scale) { + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto cublas_handle = cuda_stream->get_cublas_handle(); + auto cudnn_handle = cuda_stream->get_cudnn_handle(); + + auto scratch = maybe_cast_to_ptr(scratch_acc, sc, ih); + auto bias = maybe_cast_to_ptr(bias_acc, sc, ih); + auto weights = sc.memory(ih, weights_acc); + auto src = sc.memory(ih, src_acc); + auto dst = sc.memory(ih, dst_acc); + + matmul_impl_->execute(cublas_handle, cudnn_handle, weights, src, + dst, bias, scratch, output_scale); + }); + } + + template + void *maybe_cast_to_ptr(cl::sycl::accessor acc, sc_t &sc, + const cl::sycl::interop_handler &ih) const { + return sc.template memory(ih, acc); + } + + template + std::nullptr_t maybe_cast_to_ptr(std::nullptr_t acc, sc_t &, + const cl::sycl::interop_handler &ih) const { + return acc; + } +}; + +struct cudnn_matmul_scratch_runtime_args_base_exec_t + : public cudnn_matmul_exec_base_t { + virtual status_t execute(const exec_ctx_t &ctx, engine_t *engine, + const std::shared_ptr matmul_impl_, + float output_scale, std::size_t scratchpad_size) + = 0; + +protected: + void init_scratch_buffer(std::size_t scratch_size) { + if (scratch_size > 0) { + scratch_buff_.reset(new cl::sycl::buffer(scratch_size)); + } + } + + std::shared_ptr> scratch_buff_ {nullptr}; +}; + +struct cudnn_matmul_scratch_runtime_args_bias_exec_t + : public cudnn_matmul_scratch_runtime_args_base_exec_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + const std::shared_ptr matmul_impl_, + float output_scale, std::size_t scratchpad_size) override { + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + init_scratch_buffer(scratchpad_size); + + return cuda_stream->interop_task([=](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + 
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto bias_acc = CTX_IN_ACCESSOR(DNNL_ARG_BIAS); + + auto scratch_acc + = scratch_buff_ + ->get_access( + cgh); + + interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc, + src_acc, dst_acc, bias_acc, scratch_acc, output_scale); + }); + } +}; + +struct cudnn_matmul_runtime_args_scratch_exec_t + : public cudnn_matmul_scratch_runtime_args_base_exec_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + const std::shared_ptr matmul_impl_, + float output_scale, std::size_t scratchpad_size) override { + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + init_scratch_buffer(scratchpad_size); + + return cuda_stream->interop_task([=](cl::sycl::handler &cgh) { + auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + + auto scratch_acc + = scratch_buff_ + ->get_access( + cgh); + + interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc, + src_acc, dst_acc, nullptr, scratch_acc, output_scale); + }); + } +}; + +struct cudnn_matmul_runtime_args_bias_exec_t : public cudnn_matmul_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + const std::shared_ptr matmul_impl_, + float output_scale, std::size_t scratchpad_size) override { + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([=](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto bias_acc = CTX_IN_ACCESSOR(DNNL_ARG_BIAS); + + interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc, + src_acc, dst_acc, bias_acc, nullptr, output_scale); + }); + } +}; + +struct cudnn_matmul_runtime_args_exec_t : public cudnn_matmul_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + const std::shared_ptr matmul_impl_, + float output_scale, std::size_t scratchpad_size) override { + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([=](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + + interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc, + src_acc, dst_acc, nullptr, nullptr, output_scale); + }); + } +}; + +struct cudnn_matmul_bias_scratch_exec_t : public cudnn_matmul_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + const std::shared_ptr matmul_impl_, + float output_scale, std::size_t scratchpad_size) override { + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([=](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto bias_acc = CTX_IN_ACCESSOR(DNNL_ARG_BIAS); + + using read_write_acc_t = cl::sycl::accessor; + + auto scratch_acc = read_write_acc_t( + utils::downcast( + ctx.get_scratchpad_grantor() + .get_memory_storage(memory_tracking::names:: + key_matmul_dst_in_acc_dt) + .get()) + ->buffer() + .get_access( + cgh)); + + interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc, + src_acc, dst_acc, bias_acc, scratch_acc, output_scale); + }); + } +}; + +struct cudnn_matmul_scratch_exec_t : public cudnn_matmul_exec_base_t { + status_t 
execute(const exec_ctx_t &ctx, engine_t *engine, + const std::shared_ptr matmul_impl_, + float output_scale, std::size_t scratchpad_size) override { + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([=](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + + using read_write_acc_t = cl::sycl::accessor; + + auto scratch_acc = read_write_acc_t( + utils::downcast( + ctx.get_scratchpad_grantor() + .get_memory_storage(memory_tracking::names:: + key_matmul_dst_in_acc_dt) + .get()) + ->buffer() + .get_access( + cgh)); + + interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc, + src_acc, dst_acc, nullptr, scratch_acc, output_scale); + }); + } +}; + +struct cudnn_matmul_bias_exec_t : public cudnn_matmul_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + const std::shared_ptr matmul_impl_, + float output_scale, std::size_t scratchpad_size) override { + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([=](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto bias_acc = CTX_IN_ACCESSOR(DNNL_ARG_BIAS); + + interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc, + src_acc, dst_acc, bias_acc, nullptr, output_scale); + }); + } +}; + +struct cudnn_matmul_exec_t : public cudnn_matmul_exec_base_t { + status_t execute(const exec_ctx_t &ctx, engine_t *engine, + const std::shared_ptr matmul_impl_, + float output_scale, std::size_t scratchpad_size) override { + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([=](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + + interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc, + src_acc, dst_acc, nullptr, nullptr, output_scale); + }); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_matmul_impl.hpp b/src/gpu/nvidia/cudnn_matmul_impl.hpp new file mode 100644 index 00000000000..04440f44892 --- /dev/null +++ b/src/gpu/nvidia/cudnn_matmul_impl.hpp @@ -0,0 +1,403 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_MATMUL_IMPL_HPP +#define GPU_NVIDIA_CUDNN_MATMUL_IMPL_HPP + +#include "cudnn.h" + +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_matmul_impl_t { + + bool with_eltwise(int position, const matmul_pd_t *pd) const { + return pd->attr()->post_ops_.contain(primitive_kind::eltwise, position); + } + + float eltwise_alpha(const matmul_pd_t *pd) const { + int eltwise_idx_ = pd->attr()->post_ops_.find(primitive_kind::eltwise); + return with_eltwise(0, pd) || with_eltwise(1, pd) + ? pd->attr()->post_ops_.entry_[eltwise_idx_].eltwise.alpha + : 1.0f; + } + + float eltwise_beta(const matmul_pd_t *pd) const { + int eltwise_idx_ = pd->attr()->post_ops_.find(primitive_kind::eltwise); + return with_eltwise(0, pd) || with_eltwise(1, pd) + ? pd->attr()->post_ops_.entry_[eltwise_idx_].eltwise.beta + : 0.0f; + } + + alg_kind_t eltwise_algo(const matmul_pd_t *pd) const { + int eltwise_idx_ = pd->attr()->post_ops_.find(primitive_kind::eltwise); + return with_eltwise(0, pd) || with_eltwise(1, pd) + ? pd->attr()->post_ops_.entry_[eltwise_idx_].eltwise.alg + : dnnl_alg_kind_undef; + } + + bool with_sum(const matmul_pd_t *pd) const { + return pd->attr()->post_ops_.contain(primitive_kind::sum, 0) + || pd->attr()->post_ops_.contain(primitive_kind::sum, 1); + } + + // Returns scaling factor for post-ops=sum operation + float sum_scale(const matmul_pd_t *pd) const { + int sum_idx_ = pd->attr()->post_ops_.find(primitive_kind::sum); + return pd->attr()->post_ops_.entry_[sum_idx_].sum.scale; + } + + // creates operation descriptor based on the elemen-wise operation specified + status_t create_and_set_op_descriptor(const matmul_pd_t *pd) { + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnCreateActivationDescriptor, &act_desc_)); + + cudnnActivationMode_t mode; + + switch (eltwise_algo(pd)) { + case alg_kind::eltwise_relu: + mode = cudnnActivationMode_t::CUDNN_ACTIVATION_RELU; + break; + case alg_kind::eltwise_bounded_relu: + mode = cudnnActivationMode_t::CUDNN_ACTIVATION_CLIPPED_RELU; + break; + case alg_kind::eltwise_tanh: + mode = cudnnActivationMode_t::CUDNN_ACTIVATION_TANH; + break; + case alg_kind::eltwise_elu: + mode = cudnnActivationMode_t::CUDNN_ACTIVATION_ELU; + break; + case alg_kind::eltwise_logistic: + mode = cudnnActivationMode_t::CUDNN_ACTIVATION_SIGMOID; + break; + default: return status::unimplemented; + } + + // NaNs by default are propagated in oneDNN, although the forward + // convolution routine does not support this. + auto propagate_nan = cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN; + + // For ReLU, a ceiling of 0 means no limit. 
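+        // cuDNN reuses the coef argument of cudnnSetActivationDescriptor()
+        // across modes: it is the clipping threshold for
+        // CUDNN_ACTIVATION_CLIPPED_RELU and the alpha parameter for
+        // CUDNN_ACTIVATION_ELU, so the oneDNN eltwise alpha is passed for
+        // both cases.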
+ double ceiling = eltwise_alpha(pd); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, act_desc_, + mode, propagate_nan, ceiling)); + + return status::success; + } + + status_t init(matmul_pd_t *pd) { + CHECK(get_cublas_data_type(pd->src_md()->data_type, src_type_)); + CHECK(get_cublas_data_type(pd->weights_md()->data_type, weights_type_)); + + isbatched_ = pd->batched(); + + memory_desc_wrapper src_d = memory_desc_wrapper(pd->src_md()); + memory_desc_wrapper weights_d = memory_desc_wrapper(pd->weights_md()); + memory_desc_wrapper dst_d = memory_desc_wrapper(pd->dst_md()); + + with_bias_ = pd->with_bias(); + if ((with_bias_) + && (pd->weights_md(1)->data_type != pd->dst_md()->data_type)) { + // When datatype of bias is different from the dst, + // we need to reorder the output. + bias_dt_mismatch_ = true; + reorder_required_ = true; + CHECK(get_cublas_data_type( + pd->weights_md(1)->data_type, dst_type_)); + } else { + CHECK(get_cublas_data_type(pd->dst_md()->data_type, dst_type_)); + } + + // cuBLAS only supports s8s8f32 configuration. + // Hence, one final reorder is required if the cfg = s8s8s8 + if (dst_type_ == cudaDataType_t::CUDA_R_8I) { + reorder_required_ = true; + dst_type_ = cudaDataType_t::CUDA_R_32F; + } + + if (with_eltwise(0, pd) || with_eltwise(1, pd)) { + with_eltwise_ = true; + create_and_set_op_descriptor(pd); + } + + // Set parameter when post-op sum is specified + if (with_sum(pd)) { post_op_sum_ = sum_scale(pd); } + + has_runtime_params_ = src_d.has_runtime_dims_or_strides() + || dst_d.has_runtime_dims_or_strides() + || weights_d.has_runtime_dims_or_strides(); + + if (!has_runtime_params_) { + // Initialise all gemm parameters if there are no runtime parameters + init_parameters(src_d, weights_d, dst_d, + memory_desc_wrapper(pd->weights_md(1))); + if (with_scratchpad()) { book_scratchpad(pd, dst_d.nelems()); } + } + + if (reorder_required_ || bias_dt_mismatch_) { with_scratchpad_ = true; } + + return status::success; + } + + status_t book_scratchpad(matmul_pd_t *pd, dim_t num_elems) { + if (has_runtime_params_) { return status::unimplemented; } + // This case should only be called when no runtime parameters are + // specified + pd->scratchpad_registry().registrar().book( + memory_tracking::names::key_matmul_dst_in_acc_dt, num_elems, + types::data_type_size(get_scratchpad_type())); + return status::success; + } + + bool isbatched() { return isbatched_; } + bool with_bias() { return with_bias_; } + bool with_scratchpad() { return with_scratchpad_; } + bool has_runtime_params() { return has_runtime_params_; } + + dnnl_data_type_t get_scratchpad_type() { return scratchpad_type_; } + + void convert_dims_matmul( + const dnnl_dim_t *dims, int *new_dims, int n_dims) { + // Moving the dimensions because cudnnAddTensor doesn't work when + // bia_mask=1 + if (n_dims == 3) { return convert_dims(dims, new_dims, n_dims); } + new_dims[0] = 1; + for (size_t i = 0; i < n_dims; i++) { + new_dims[i + 1] = static_cast(dims[i]); + } + for (size_t i = n_dims; i < 4; i++) { + new_dims[i + 1] = 1; + } + } + + status_t init_gemm_parameters(const memory_desc_wrapper src_d, + const memory_desc_wrapper weights_d, + const memory_desc_wrapper dst_d) { + const auto &dst_bd = dst_d.blocking_desc(); + + if (isbatched_) { batch_count_ = dst_d.dims()[0]; } + + const dim_t M = dst_d.dims()[isbatched_ + 1]; + const dim_t N = dst_d.dims()[isbatched_ + 0]; + const dim_t K = src_d.dims()[isbatched_ + 1]; + + M_ = (int)M; + N_ = (int)N; + K_ = (int)K; + + const auto &src_strides = 
&src_d.blocking_desc().strides[isbatched_]; + const auto &weights_strides + = &weights_d.blocking_desc().strides[isbatched_]; + + // A matrix is the weights + transA_ = weights_strides[1] == 1 + && weights_d.dims()[isbatched_ + 0] > 1 + ? cublasOperation_t::CUBLAS_OP_N + : cublasOperation_t::CUBLAS_OP_T; + // B matrix is the src + transB_ = src_strides[1] == 1 && src_d.dims()[isbatched_ + 0] > 1 + ? cublasOperation_t::CUBLAS_OP_N + : cublasOperation_t::CUBLAS_OP_T; + + lda_ = (int) + weights_strides[transA_ == cublasOperation_t::CUBLAS_OP_N ? 0 + : 1]; + ldb_ = (int) + src_strides[transB_ == cublasOperation_t::CUBLAS_OP_N ? 0 : 1]; + ldc_ = (int)dst_bd.strides[isbatched_ + 0]; + + if (isbatched_) { + // These parameters are required for cublasGemmStridedBatchedEx() + stride_a_ = (transA_ == cublasOperation_t::CUBLAS_OP_N) ? lda_ * K_ + : lda_ * M_; + stride_b_ = (transB_ == cublasOperation_t::CUBLAS_OP_N) ? ldb_ * N_ + : ldb_ * K_; + stride_c_ = ldc_ * N_; + } + + return status::success; + } + + status_t init_parameters(const memory_desc_wrapper src_d, + const memory_desc_wrapper weights_d, + const memory_desc_wrapper dst_d, const memory_desc_wrapper bias_d) { + // Matmul supports runtime paramters for dimensions and scales. + // We need to initialize them in the execute function. + init_gemm_parameters(src_d, weights_d, dst_d); + + if (with_bias_ || reorder_required_ || with_eltwise_) { + // Initialise cuDNN descriptors + cudnnDataType_t data_types[NUM_IO]; + int ndims = dst_d.ndims() < 4 ? 4 : dst_d.ndims(); + int dims[NUM_IO][DNNL_MAX_NDIMS]; + int strides[NUM_IO][DNNL_MAX_NDIMS]; + + convert_dims_matmul(dst_d.dims(), dims[dst], dst_d.ndims()); + CHECK(convert_data_type(dst_d.md_, &data_types[dst], false)); + convert_dims_matmul( + dst_d.blocking_desc().strides, strides[dst], dst_d.ndims()); + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst], + data_types[dst], ndims, dims[dst], strides[dst])); + + if (reorder_required_ && !bias_dt_mismatch_) { + // If reorder is required, we need to create a scratchpad memory + // to store the intermediate result + with_scratchpad_ = true; + scratchpad_type_ = data_type::f32; + CHECK(create_and_set_tensor_descriptor(&temp_mem_desc_, + cudnnDataType_t::CUDNN_DATA_FLOAT, ndims, dims[dst], + strides[dst])); + } + + if (with_bias_) { + // Create bias and destination tensor descriptors + convert_dims_matmul(bias_d.dims(), dims[bias], bias_d.ndims()); + convert_dims_matmul(bias_d.blocking_desc().strides, + strides[bias], bias_d.ndims()); + CHECK(convert_data_type(bias_d.md_, &data_types[bias], false)); + CHECK(create_and_set_tensor_descriptor(&tensor_descs_[bias], + data_types[bias], ndims, dims[bias], strides[bias])); + if (bias_dt_mismatch_) { + with_scratchpad_ = true; + scratchpad_type_ = bias_d.data_type(); + CHECK(create_and_set_tensor_descriptor(&temp_mem_desc_, + data_types[bias], ndims, dims[dst], strides[dst])); + } + } + } + return status::success; + } + + void execute(cublasHandle_t cublas_handle, cudnnHandle_t cudnn_handle, + void *a, void *b, void *c, void *bias, void *scratch, + const float scales) { + float gemm_beta = 0; + if (!bias_dt_mismatch_ && !reorder_required_) { + // Case where no reorder is required, scratchpad points to dst (c) + scratch = c; + temp_mem_desc_ = tensor_descs_[io::dst]; + gemm_beta = post_op_sum_; + } + if (isbatched_) { + // Calls cublasGemmStridedBatchedEx() + CUBLAS_EXECUTE_FUNC(cublasGemmStridedBatchedEx, cublas_handle, + transA_, transB_, M_, N_, K_, &scales, a, weights_type_, + lda_, stride_a_, b, 
src_type_, ldb_, stride_b_, &gemm_beta, + scratch, dst_type_, ldc_, stride_c_, batch_count_, + acc_type_, gemm_algo_); + } else { + // Calls cublasGemmEx() + CUBLAS_EXECUTE_FUNC(cublasGemmEx, cublas_handle, transA_, transB_, + M_, N_, K_, &scales, a, weights_type_, lda_, b, src_type_, + ldb_, &gemm_beta, scratch, dst_type_, ldc_, acc_type_, + gemm_algo_); + } + if (with_bias_) { + // When bias is specified call cudnnAddTensor() + float bias_beta = 1; + CUDNN_EXECUTE_FUNC(cudnnAddTensor, cudnn_handle, &scales, + tensor_descs_[io::bias], bias, &bias_beta, temp_mem_desc_, + scratch); + } + if (with_eltwise_) { + // Perform elementwise operation if specified + float alpha = 1; + float beta = 0; + CUDNN_EXECUTE_FUNC(cudnnActivationForward, cudnn_handle, act_desc_, + &alpha, temp_mem_desc_, scratch, &beta, temp_mem_desc_, + scratch); + } + if (reorder_required_) { + // Reorder from scratchpad to destination if required + float reorder_alpha = 1, reorder_beta = 0; + CUDNN_EXECUTE_FUNC(cudnnTransformTensor, cudnn_handle, + &reorder_alpha, temp_mem_desc_, scratch, &post_op_sum_, + tensor_descs_[io::dst], c); + } + } + + ~cudnn_matmul_impl_t() { cleanup(); } + + void cleanup() { + if (act_desc_) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyActivationDescriptor, act_desc_); + act_desc_ = nullptr; + } + if ((reorder_required_ && !bias_dt_mismatch_) + || (with_bias_ && bias_dt_mismatch_) && temp_mem_desc_) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, temp_mem_desc_); + temp_mem_desc_ = nullptr; + } + for (size_t i = 0; i < NUM_IO; i++) { + if (tensor_descs_[i]) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorDescriptor, tensor_descs_[i]); + tensor_descs_[i] = nullptr; + } + } + } + +private: + status_t get_cublas_data_type( + dnnl_data_type_t data_type, cudaDataType_t &blas_dt) { + switch (data_type) { + case dnnl_data_type_t::dnnl_f32: + blas_dt = CUDA_R_32F; + return status::success; + case dnnl_data_type_t::dnnl_f16: + blas_dt = CUDA_R_16F; + return status::success; + case dnnl_data_type_t::dnnl_s8: + blas_dt = CUDA_R_8I; + return status::success; + default: return status::unimplemented; + } + return status::unimplemented; + } + cublasOperation_t transA_; + cublasOperation_t transB_; + int M_, N_, K_; + int lda_, ldb_, ldc_; + long long int stride_a_, stride_b_, stride_c_; + bool isbatched_ = false, with_bias_ = false, bias_dt_mismatch_ = false; + bool reorder_required_ = false, with_eltwise_ = false; + bool with_scratchpad_ = false, has_runtime_params_ = false; + dnnl_data_type_t scratchpad_type_; + cudaDataType_t src_type_, weights_type_, dst_type_; + cudaDataType_t acc_type_ = cudaDataType_t::CUDA_R_32F, bias_type_; + cublasGemmAlgo_t gemm_algo_ + = cublasGemmAlgo_t::CUBLAS_GEMM_DEFAULT_TENSOR_OP; + int batch_count_; + enum io { bias = 0, dst, NUM_IO }; + cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {}, + temp_mem_desc_ = nullptr; + cudnnActivationDescriptor_t act_desc_ = nullptr; + float post_op_sum_; +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_pooling.cpp b/src/gpu/nvidia/cudnn_pooling.cpp new file mode 100644 index 00000000000..03b7f38df0e --- /dev/null +++ b/src/gpu/nvidia/cudnn_pooling.cpp @@ -0,0 +1,157 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the 
License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/nvidia/cudnn_pooling.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "sycl/sycl_buffer_memory_storage.hpp" + +#include + +#include "common/nstl.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_pooling_fwd_t::execute(const exec_ctx_t &ctx) const { + // If dst is empty, do nothing + memory_desc_wrapper dst_wrap(pd()->dst_md()); + if (dst_wrap.size() == 0) return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + bool is_training = pd()->desc()->prop_kind == prop_kind::forward_training; + auto wkspace_st = is_training + ? ctx.output(DNNL_ARG_WORKSPACE)->memory_storage() + : &memory_storage_t::empty_storage(); + + memory_desc_wrapper src_wrap(pd()->src_md()); + auto dst_offset_bytes = src_wrap.nelems() * src_wrap.data_type_size(); + + // If src is empty and dst is not, fill dst with + // numeric_limits
::lowest() to match the other backends' behaviour + if (src_wrap.size() == 0 && dst_wrap.size() != 0) { + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + + auto dst = sc.memory(ih, dst_acc); + + if (dst_wrap.data_type() == data_type_t::dnnl_f32) { + auto val = nstl::numeric_limits::lowest(); + cuMemsetD32Async(reinterpret_cast(dst), + reinterpret_cast(val), dst_wrap.nelems(), + cuda_stream->get_underlying_stream()); + } else if (dst_wrap.data_type() == data_type_t::dnnl_f16) { + float16_t val = nstl::numeric_limits::lowest(); + cuMemsetD16Async(reinterpret_cast(dst), + reinterpret_cast(val), + dst_wrap.nelems(), + cuda_stream->get_underlying_stream()); + } else if (dst_wrap.data_type() == data_type_t::dnnl_s8) { + auto val = nstl::numeric_limits::lowest(); + cuMemsetD8Async(reinterpret_cast(dst), + reinterpret_cast(val), + dst_wrap.nelems(), + cuda_stream->get_underlying_stream()); + } + }); + }); + } + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + + std::shared_ptr< + cl::sycl::accessor> + wkspace_acc; + if (!wkspace_st->is_null()) { + wkspace_acc = std::make_shared>( + utils::downcast( + wkspace_st) + ->buffer() + .template get_access( + cgh)); + } + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + auto x = sc.memory(ih, src_acc); + auto y = sc.memory(ih, dst_acc); + uint8_t *ws_x = nullptr, *ws_y = nullptr; + if (!wkspace_st->is_null()) { + ws_x = sc.memory(ih, *wkspace_acc); + ws_y = ws_x + dst_offset_bytes; + } + + pd()->pooling_impl_->execute(handle, x, y, ws_x, ws_y); + }); + }); +} + +status_t cudnn_pooling_bwd_t::execute(const exec_ctx_t &ctx) const { + if (has_zero_dims(pd()->diff_src_md()->dims, pd()->diff_src_md()->ndims) + || has_zero_dims( + pd()->diff_dst_md()->dims, pd()->diff_dst_md()->ndims)) { + return status::success; + } + + memory_desc_wrapper wrap(pd()->diff_src_md()); + if (wrap.size() == 0) { return status::success; } + const auto dst_offset_bytes = wrap.size(); + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC); + auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + auto wkspace_acc = CTX_IN_ACCESSOR(DNNL_ARG_WORKSPACE); + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + auto dx = sc.memory(ih, diff_src_acc); + auto dy = sc.memory(ih, diff_dst_acc); + auto ws_x = sc.memory(ih, wkspace_acc); + auto ws_y = ws_x + dst_offset_bytes; + + pd()->pooling_impl_->execute(handle, dx, dy, ws_x, ws_y); + }); + }); +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_pooling.hpp b/src/gpu/nvidia/cudnn_pooling.hpp new file mode 100644 index 00000000000..ae12c38d4e5 --- /dev/null +++ 
b/src/gpu/nvidia/cudnn_pooling.hpp @@ -0,0 +1,200 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_POOLING_HPP +#define GPU_NVIDIA_CUDNN_POOLING_HPP + +#include "common/c_types_map.hpp" +#include "common/pooling_pd.hpp" +#include "common/primitive.hpp" +#include "common/type_helpers.hpp" +#include "gpu/nvidia/cudnn_pooling_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_pooling_common_t { + template + void init_ws(const pd_t *pd, memory_desc_t &ws_md) { + bool is_fwd = pd->is_fwd(); + memory_desc_wrapper src_wrap(is_fwd ? pd->src_md() : pd->diff_src_md()); + memory_desc_wrapper dst_wrap(is_fwd ? pd->dst_md() : pd->diff_dst_md()); + + const auto src_size = src_wrap.nelems(); + const auto dst_size = dst_wrap.nelems(); + const dims_t ws_size = {(dim_t)(src_size + dst_size)}; + + dnnl_memory_desc_init_by_tag( + &ws_md, 1, ws_size, src_wrap.data_type(), format_tag::x); + } + + status_t init_mem_by_tag(format_tag_t tag, memory_desc_t &md) { + if (tag == format_tag::undef) { return status::unimplemented; } + CHECK(memory_desc_init_by_tag(md, tag)); + return status::success; + } + + format_tag_t get_tag(const memory_desc_t &md) const { + using namespace format_tag; + auto tag = memory_desc_matches_one_of_tag(md, ab, abc, abcd, + abcde, // NCHW derivatives + ba, bca, bcda, bcdea, cba, cdba, + cdeba, // IO and spatial derivatives + acb, acdb, acdeb, // NHWC derivatives + aBcd16b, aBcde16b, aBcd8b, aBcde8b, aBcd4b, + aBcde4b); // blocked layouts + return tag; + } +}; + +struct cudnn_pooling_fwd_t : public primitive_t { + struct pd_t : public pooling_fwd_pd_t, public cudnn_pooling_common_t { + using pooling_fwd_pd_t::pooling_fwd_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_pooling_fwd_t); + + status_t init(engine_t *engine) { + using namespace data_type; + using namespace prop_kind; + using namespace alg_kind; + using namespace format_tag; + + assert(engine->kind() == engine_kind::gpu); + auto src_dt = src_md()->data_type; + + bool ok = true && is_fwd(); + ok = ok && set_default_params() == status::success; + ok = ok + && utils::one_of(desc()->prop_kind, forward_training, + forward_inference); + ok = ok + && utils::one_of(desc()->alg_kind, pooling_max, + pooling_avg_include_padding, + pooling_avg_exclude_padding); + ok = ok && utils::one_of(src_dt, s8, f16, f32); + ok = ok + && IMPLICATION(utils::one_of(src_dt, f16), + desc()->prop_kind == forward_inference); + ok = ok + && IMPLICATION( + src_dt == s8, desc()->accum_data_type == s32); + ok = ok && attr()->has_default_values(); + ok = ok && blocking_ok(); + if (!ok) return status::unimplemented; + 
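+            // For forward training, cuDNN pooling backward needs the original
+            // src and dst tensors, so init_ws() below books a workspace large
+            // enough for both (src elements followed by dst elements) and the
+            // forward pass copies x and y into it.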
+ bool is_training = desc_.prop_kind == forward_training; + if (is_training) init_ws(this, ws_md_); + + if (has_zero_dim_memory()) return status::success; + + pooling_impl_.reset(new cudnn_pooling_fwd_impl_t()); + return pooling_impl_->init(this); + } + + bool blocking_ok() const { + if (!utils::one_of(src_md()->data_type, data_type::s8) + && src_md()->format_desc.blocking.inner_nblks > 0) + return false; + + if (src_md()->format_desc.blocking.inner_nblks > 1) return false; + + if (utils::one_of(src_md()->data_type, data_type::s8) + && src_md()->format_desc.blocking.inner_nblks == 1) { + return memory_desc_matches_nchw_vect_c(src_md()) + && memory_desc_matches_nchw_vect_c(dst_md()); + } + + return true; + } + + std::shared_ptr pooling_impl_; + }; + + cudnn_pooling_fwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +struct cudnn_pooling_bwd_t : public primitive_t { + struct pd_t : public pooling_bwd_pd_t, public cudnn_pooling_common_t { + using pooling_bwd_pd_t::pooling_bwd_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_pooling_bwd_t); + + status_t init(engine_t *engine) { + using namespace prop_kind; + using namespace alg_kind; + using namespace format_tag; + assert(engine->kind() == engine_kind::gpu); + + bool ok = true && !is_fwd() + && set_default_params() == status::success + && desc()->prop_kind == backward_data + && utils::one_of(desc()->alg_kind, pooling_max, + pooling_avg_include_padding, + pooling_avg_exclude_padding) + && (utils::everyone_is(data_type::f32, + diff_dst_md()->data_type, + diff_src_md()->data_type) + || utils::everyone_is(data_type::f16, + diff_dst_md()->data_type, + diff_src_md()->data_type)) + && attr()->has_default_values() && no_blocking(); + if (!ok) return status::unimplemented; + + init_mem_by_tag(get_tag(diff_dst_md_), diff_src_md_); + + init_ws(this, ws_md_); + if (!compare_ws(hint_fwd_pd_)) return status::unimplemented; + + if (has_zero_dim_memory()) { return status::success; }; + + pooling_impl_.reset(new cudnn_pooling_bwd_impl_t()); + return pooling_impl_->init(this); + } + + bool no_blocking() const { + return diff_src_md()->format_desc.blocking.inner_nblks + + diff_dst_md()->format_desc.blocking.inner_nblks + == 0; + } + + std::shared_ptr pooling_impl_; + }; + + cudnn_pooling_bwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_pooling_impl.hpp b/src/gpu/nvidia/cudnn_pooling_impl.hpp new file mode 100644 index 00000000000..cd21283c882 --- /dev/null +++ b/src/gpu/nvidia/cudnn_pooling_impl.hpp @@ -0,0 +1,234 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_POOLING_IMPL_HPP +#define GPU_NVIDIA_CUDNN_POOLING_IMPL_HPP + +#include + +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_pooling_impl_base_t { + virtual status_t init(const pooling_pd_t *pd) = 0; + + virtual ~cudnn_pooling_impl_base_t() { + for (size_t i = 0; i < NUM_IO; ++i) { + if (tensor_descs_[i]) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorDescriptor, tensor_descs_[i]); + } + } + + if (pool_desc_) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyPoolingDescriptor, pool_desc_); + } + } + + virtual void execute(cudnnHandle_t handle, void *x, void *y, void *ws_x, + void *ws_y) const = 0; + +protected: + status_t init_common(const pooling_pd_t *pd) { + ndims_ = std::max(4, pd->ndims()); + kernel_ndims_ = ndims_ - 2; + + // Only 1D, 2D and 3D pooling is supported by cuDNN + if (kernel_ndims_ > 3) { return status::unimplemented; } + + // cuDNN requires symmetric padding, however it seems that + // configurations where padding in the beginning > padding at the end of + // dimensions work as expected. When padding at the end of any dimension + // > padding in the beginning of that dimension the results are wrong + // since the data is rearranged incorrectly due to the limitation that + // padding has to be the same. This applies to configurations which use + // the "average include padding" algorithm. Therefore, such + // configurations return status::unimplemented since the results are + // wrong. + if (pd->desc()->alg_kind == alg_kind::pooling_avg_include_padding + && (pd->padL() < pd->padR() || pd->padT() < pd->padB() + || pd->padFront() < pd->padBack())) { + return status::unimplemented; + } + + is_training_ = pd->desc()->prop_kind == prop_kind::forward_training; + bool is_fwd = pd->is_fwd(); + auto src_md = is_fwd ? pd->src_md() : pd->diff_src_md(); + auto dst_md = is_fwd ? 
pd->dst_md() : pd->diff_dst_md(); + + if (has_zero_dims(src_md->dims, pd->ndims()) + || has_zero_dims(dst_md->dims, pd->ndims())) { + return status::success; + } + + if (is_training_) { + auto src_wrap = memory_desc_wrapper(src_md); + auto dst_wrap = memory_desc_wrapper(dst_md); + x_size_bytes_ = src_wrap.size(); + y_size_bytes_ = dst_wrap.size(); + } + + convert_dims(src_md->padded_dims, dims_[src], pd->ndims()); + convert_dims(dst_md->padded_dims, dims_[dst], pd->ndims()); + + convert_dims(src_md->format_desc.blocking.strides, strides_[src], + pd->ndims()); + convert_dims(dst_md->format_desc.blocking.strides, strides_[dst], + pd->ndims()); + + convert_dims(pd->desc()->kernel, kernel_dims_, kernel_ndims_); + + // If 1D pooling + if (pd->ndims() == 3) { + // Convert to [n, c, 1, w] since the current format is + // [n, c, w, 1] + dims_[src][3] = dims_[src][2]; + dims_[src][2] = 1; + + dims_[dst][3] = dims_[dst][2]; + dims_[dst][2] = 1; + + // Set kernel dimensions to [1, kw] + kernel_dims_[1] = kernel_dims_[0]; + kernel_dims_[0] = 1; + } + + if (ndims_ == 4) { + kernel_padding_[0] = static_cast(pd->padT()); + kernel_padding_[1] = static_cast(pd->padL()); + + kernel_strides_[0] = static_cast(pd->KSH()); + kernel_strides_[1] = static_cast(pd->KSW()); + } else { + kernel_padding_[0] = static_cast(pd->padFront()); + kernel_padding_[1] = static_cast(pd->padT()); + kernel_padding_[2] = static_cast(pd->padL()); + + kernel_strides_[0] = static_cast(pd->KSD()); + kernel_strides_[1] = static_cast(pd->KSH()); + kernel_strides_[2] = static_cast(pd->KSW()); + } + + CHECK(convert_data_type(src_md, &data_types_[src])); + CHECK(convert_data_type(dst_md, &data_types_[dst])); + + CHECK(convert_alg_kind(pd->desc()->alg_kind, &pool_mode_)); + + cudnnTensorFormat_t src_format, dst_format; + CHECK(get_format(src_md, src_format)); + CHECK(get_format(dst_md, dst_format)); + + CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[src], + src_format, data_types_[src], ndims_, dims_[src])); + CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[dst], + dst_format, data_types_[dst], ndims_, dims_[dst])); + + CHECK(create_and_set_pooling_descriptor(pd)); + + return status::success; + } + + status_t create_and_set_pooling_descriptor(const pooling_pd_t *pd) { + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreatePoolingDescriptor, &pool_desc_)); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetPoolingNdDescriptor, pool_desc_, + pool_mode_, CUDNN_PROPAGATE_NAN, kernel_ndims_, kernel_dims_, + kernel_padding_, kernel_strides_)); + + return status::success; + } + + status_t convert_alg_kind( + alg_kind_t alg_kind, cudnnPoolingMode_t *cudnn_alg_kind) const { + switch (alg_kind) { + case alg_kind::pooling_max: + *cudnn_alg_kind = CUDNN_POOLING_MAX; + break; + case alg_kind::pooling_avg_include_padding: + *cudnn_alg_kind = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + case alg_kind::pooling_avg_exclude_padding: + *cudnn_alg_kind = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + break; + default: return status::unimplemented; + } + + return status::success; + } + + enum io { src = 0, dst, NUM_IO }; + cudnnDataType_t data_types_[NUM_IO]; + cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {}; + cudnnPoolingDescriptor_t pool_desc_; + cudnnPoolingMode_t pool_mode_ = CUDNN_POOLING_MAX; + int dims_[NUM_IO][DNNL_MAX_NDIMS]; + int strides_[NUM_IO][DNNL_MAX_NDIMS]; + int kernel_dims_[DNNL_MAX_NDIMS]; + int kernel_padding_[DNNL_MAX_NDIMS]; + int kernel_strides_[DNNL_MAX_NDIMS]; + const float alpha_ = 1.f, beta_ = 0.f; + int ndims_, kernel_ndims_; 
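+    // Training-specific state: whether the primitive runs forward_training
+    // and, if so, the byte sizes of the src (x) and dst (y) copies that are
+    // stashed in the workspace for the backward pass.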
+ bool is_training_ = false; + std::size_t x_size_bytes_ = 0, y_size_bytes_ = 0; +}; + +struct cudnn_pooling_fwd_impl_t : public cudnn_pooling_impl_base_t { + status_t init(const pooling_pd_t *pd) override { + return cudnn_pooling_impl_base_t::init_common(pd); + } + + void execute(cudnnHandle_t handle, void *x, void *y, void *ws_x, + void *ws_y) const override { + + CUDNN_EXECUTE_FUNC(cudnnPoolingForward, handle, pool_desc_, &alpha_, + tensor_descs_[src], x, &beta_, tensor_descs_[dst], y); + + if (is_training_) { + // Copy x and y into workspace so that they can be used + // in the backward pass + cudnnAddTensor(handle, &alpha_, tensor_descs_[src], x, &beta_, + tensor_descs_[src], ws_x); + cudnnAddTensor(handle, &alpha_, tensor_descs_[dst], y, &beta_, + tensor_descs_[dst], ws_y); + } + } +}; + +struct cudnn_pooling_bwd_impl_t : public cudnn_pooling_impl_base_t { + status_t init(const pooling_pd_t *pd) override { + return cudnn_pooling_impl_base_t::init_common(pd); + } + + void execute(cudnnHandle_t handle, void *dx, void *dy, void *ws_x, + void *ws_y) const override { + + CUDNN_EXECUTE_FUNC(cudnnPoolingBackward, handle, pool_desc_, &alpha_, + tensor_descs_[dst], ws_y, tensor_descs_[dst], dy, + tensor_descs_[src], ws_x, &beta_, tensor_descs_[src], dx); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_reorder.cpp b/src/gpu/nvidia/cudnn_reorder.cpp new file mode 100644 index 00000000000..0fd6cd3a161 --- /dev/null +++ b/src/gpu/nvidia/cudnn_reorder.cpp @@ -0,0 +1,55 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "gpu/nvidia/cudnn_reorder.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_reorder_t::execute(const exec_ctx_t &ctx) const { + memory_desc_wrapper wrap(pd()->src_md()); + if (wrap.size() == 0) { return status::success; } + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + auto a = sc.memory(ih, src_acc) + + pd()->reorder_->src_offset_in_bytes(); + auto b = sc.memory(ih, dst_acc) + + pd()->reorder_->dst_offset_in_bytes(); + pd()->reorder_->execute(handle, a, b); + }); + }); +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_reorder.hpp b/src/gpu/nvidia/cudnn_reorder.hpp new file mode 100644 index 00000000000..d5d43d65f4a --- /dev/null +++ b/src/gpu/nvidia/cudnn_reorder.hpp @@ -0,0 +1,122 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_REORDER_HPP +#define GPU_NVIDIA_CUDNN_REORDER_HPP + +#include "common/memory_desc_wrapper.hpp" +#include "common/primitive.hpp" +#include "common/reorder_pd.hpp" +#include "gpu/nvidia/cudnn_reorder_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_reorder_t : public primitive_t { + using primitive_t::primitive_t; + + struct pd_t : public reorder_pd_t { + using reorder_pd_t::reorder_pd_t; + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_reorder_t); + + static status_t create(reorder_pd_t **reorder_pd, engine_t *engine, + const primitive_attr_t *attr, engine_t *src_engine, + const memory_desc_t *src_md, engine_t *dst_engine, + const memory_desc_t *dst_md) { + auto _pd = new pd_t(attr, src_engine->kind(), src_md, + dst_engine->kind(), dst_md); + if (_pd == nullptr) return status::out_of_memory; + if (_pd->init(engine, src_engine, dst_engine) != status::success) { + delete _pd; + return status::unimplemented; + } + _pd->init_scratchpad_md(); + return safe_ptr_assign(*reorder_pd, _pd); + } + + // Function to verify data and memory format + bool valid_data_n_mem_format() const { + bool ok = utils::one_of(src_md()->data_type, data_type::s8, + data_type::f16, data_type::f32) + && utils::one_of(dst_md()->data_type, data_type::s8, + data_type::f16, data_type::f32); + + // Nvidia only supports blocking for Int8 + if (!utils::one_of(src_md()->data_type, data_type::s8) + && src_md()->format_desc.blocking.inner_nblks > 0) + return false; + if (!utils::one_of(dst_md()->data_type, data_type::s8) + && dst_md()->format_desc.blocking.inner_nblks > 0) + return false; + + // Nvidia supports blocking only on channel dimension C + if (dst_md()->format_desc.blocking.inner_nblks > 1 + || src_md()->format_desc.blocking.inner_nblks > 1) + return false; + if (utils::one_of(src_md()->data_type, data_type::s8) + && src_md()->format_desc.blocking.inner_nblks == 1) { + ok = ok && memory_desc_matches_nchw_vect_c(src_md()); + } + int blks = dst_md()->format_desc.blocking.inner_nblks; + if (utils::one_of(dst_md()->data_type, data_type::s8) + && blks == 1) { + ok = ok && memory_desc_matches_nchw_vect_c(dst_md()); + } + return ok; + } + + bool check_scales_mask() const { + // cuDNN does not support scaling per dimension. 
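+        // A mask of 0 means one common scale for the whole tensor, which is
+        // the only case that maps onto the single alpha scaling factor used
+        // by the cuDNN transform routines.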
+ if (attr()->output_scales_.mask_ != 0) { return false; } + return true; + } + + status_t init( + engine_t *engine, engine_t *src_engine, engine_t *dst_engine) { + bool ok = true && (engine == dst_engine) + && (src_engine->kind() == engine_kind::gpu) + && valid_data_n_mem_format() && check_scales_mask(); + if (!ok) return status::unimplemented; + if (has_different_block_size(src_md(), dst_md())) { + reorder_.reset(new cudnn_reorder_ex_t()); + } else { + reorder_.reset(new cudnn_reorder_stride_t()); + } + + return reorder_->init(this); + } + std::shared_ptr reorder_; + }; + + cudnn_reorder_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_reorder_impl.cpp b/src/gpu/nvidia/cudnn_reorder_impl.cpp new file mode 100644 index 00000000000..b07e3294a3f --- /dev/null +++ b/src/gpu/nvidia/cudnn_reorder_impl.cpp @@ -0,0 +1,46 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "common/engine.hpp" +#include "gpu/nvidia/cudnn_reorder.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/ocl/cross_engine_reorder.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +namespace { + +using rpd_create_f = dnnl::impl::engine_t::reorder_primitive_desc_create_f; + +const rpd_create_f cuda_reorder_impl_list[] + = {gpu::ocl::cross_engine_reorder_t::pd_t::create, + cudnn_reorder_t::pd_t::create, nullptr}; +} // namespace + +const rpd_create_f * +cuda_gpu_engine_impl_list_t::get_reorder_implementation_list( + const memory_desc_t *, const memory_desc_t *) { + return cuda_reorder_impl_list; +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_reorder_impl.hpp b/src/gpu/nvidia/cudnn_reorder_impl.hpp new file mode 100644 index 00000000000..ea3118a0b32 --- /dev/null +++ b/src/gpu/nvidia/cudnn_reorder_impl.hpp @@ -0,0 +1,182 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_REORDER_IMPL_HPP +#define GPU_NVIDIA_CUDNN_REORDER_IMPL_HPP + +#include "common/type_helpers.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_reorder_generic_t { +public: + virtual status_t init(const reorder_pd_t *pd) = 0; + + virtual void execute(cudnnHandle_t handle, void *src, void *dst) const = 0; + + virtual ~cudnn_reorder_generic_t() { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, src_desc_); + CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, dst_desc_); + } + + int dst_offset_in_bytes() { return dst_offset_in_bytes_; } + int src_offset_in_bytes() { return src_offset_in_bytes_; } + +protected: + cudnnDataType_t src_data_type_; + cudnnDataType_t dst_data_type_; + int ndims_; + int dims_[DNNL_MAX_NDIMS]; + cudnnTensorDescriptor_t src_desc_; + cudnnTensorDescriptor_t dst_desc_; + float alpha_, beta_; + int dst_offset_in_bytes_ = 0; + int src_offset_in_bytes_ = 0; +}; + +// This structure is used when the memory format includes blocking +struct cudnn_reorder_ex_t : public cudnn_reorder_generic_t { +public: + status_t init(const reorder_pd_t *pd) override { + // If any of the dimensions are 0 we should not continue with creating + // cudnn descriptors + memory_desc_wrapper wrap(pd->src_md()); + if (wrap.size() == 0) { return status::success; } + // Validity checks + assert(pd->dst_md()->ndims == pd->src_md()->ndims); + + get_format(pd->src_md(), src_format_); + get_format(pd->dst_md(), dst_format_); + dst_offset_in_bytes_ = pd->dst_md()->offset0 + * types::data_type_size(pd->dst_md()->data_type); + src_offset_in_bytes_ = pd->src_md()->offset0 + * types::data_type_size(pd->src_md()->data_type); + alpha_ = pd->alpha(); + beta_ = pd->beta(); + + CHECK(convert_data_type(pd->src_md(), &src_data_type_)); + CHECK(convert_data_type(pd->dst_md(), &dst_data_type_)); + + convert_dims(pd->src_md()->padded_dims, dims_, pd->src_md()->ndims); + + ndims_ = pd->dst_md()->ndims > 4 ? pd->dst_md()->ndims : 4; + + // Create and set tensor transform descriptor + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnCreateTensorTransformDescriptor, &trans_desc_)); + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorTransformDescriptor, + trans_desc_, ndims_, dst_format_, nullptr, nullptr, nullptr, + cudnnFoldingDirection_t::CUDNN_TRANSFORM_FOLD)); + // Create and set source tensor descriptor + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &src_desc_)); + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptorEx, src_desc_, + src_format_, src_data_type_, ndims_, dims_)); + // Create and set destination tensor descriptor + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &dst_desc_)); + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptorEx, dst_desc_, + dst_format_, dst_data_type_, ndims_, dims_)); + return status::success; + } + + void execute(cudnnHandle_t handle, void *src, void *dst) const override { + // cudnnTransformTensorEx() function is required to support blocking. + // It requires the output tensor to be in cuDNN supported format. 
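+        // cudnnTransformTensorEx() computes dst = alpha * transform(src) +
+        // beta * dst, so the reorder scales alpha_ and beta_ taken from the
+        // reorder primitive descriptor are applied directly by this call.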
+ CUDNN_EXECUTE_FUNC(cudnnTransformTensorEx, handle, trans_desc_, &alpha_, + src_desc_, src, &beta_, dst_desc_, dst); + } + + ~cudnn_reorder_ex_t() { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorTransformDescriptor, trans_desc_); + } + +private: + cudnnTensorFormat_t src_format_; + cudnnTensorFormat_t dst_format_; + cudnnTensorTransformDescriptor_t trans_desc_; + + using cudnn_reorder_generic_t::cudnn_reorder_generic_t; +}; + +// This structure is used when the memory format does not include blocking +struct cudnn_reorder_stride_t : public cudnn_reorder_generic_t { +public: + status_t init(const reorder_pd_t *pd) override { + // If any of the dimensions are 0 we should not continue with creating + // cudnn descriptors + memory_desc_wrapper wrap(pd->src_md()); + if (wrap.size() == 0) { return status::success; } + + // Validity checks + assert(pd->dst_md()->ndims == pd->src_md()->ndims); + dst_offset_in_bytes_ = pd->dst_md()->offset0 + * types::data_type_size(pd->dst_md()->data_type); + src_offset_in_bytes_ = pd->src_md()->offset0 + * types::data_type_size(pd->src_md()->data_type); + alpha_ = pd->alpha(); + beta_ = pd->beta(); + + convert_dims(pd->dst_md()->dims, dims_, pd->dst_md()->ndims); + convert_dims(pd->src_md()->format_desc.blocking.strides, src_strides_, + pd->src_md()->ndims); + convert_dims(pd->dst_md()->format_desc.blocking.strides, dst_strides_, + pd->dst_md()->ndims); + adjust_dim_for_dnn(dims_, pd->dst_md()->ndims, pd->src_md()); + adjust_stride_for_dnn(src_strides_, pd->dst_md()->ndims, pd->src_md()); + adjust_stride_for_dnn(dst_strides_, pd->dst_md()->ndims, pd->dst_md()); + ndims_ = pd->dst_md()->ndims >= 4 ? pd->dst_md()->ndims + + pd->dst_md()->format_desc.blocking.inner_nblks + : 4; + bool vectorized = has_different_block_size(pd->src_md(), pd->dst_md()); + CHECK(convert_data_type(pd->src_md(), &src_data_type_, vectorized)); + CHECK(convert_data_type(pd->dst_md(), &dst_data_type_, vectorized)); + // Create and set source tensor descriptor + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &src_desc_)); + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptor, src_desc_, + src_data_type_, ndims_, dims_, src_strides_)); + // Create and set destination tensor descriptor + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &dst_desc_)); + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptor, dst_desc_, + dst_data_type_, ndims_, dims_, dst_strides_)); + return status::success; + } + + void execute(cudnnHandle_t handle, void *src, void *dst) const override { + // We don't need to specify the format (deducible using the strides) + // in case of cudnnTransformTensor(). 
+ // For example, this is useful when converting from abcd to bacd + CUDNN_EXECUTE_FUNC(cudnnTransformTensor, handle, &alpha_, src_desc_, + src, &beta_, dst_desc_, dst); + } + +private: + int src_strides_[DNNL_MAX_NDIMS]; + int dst_strides_[DNNL_MAX_NDIMS]; + + using cudnn_reorder_generic_t::cudnn_reorder_generic_t; +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_resampling.cpp b/src/gpu/nvidia/cudnn_resampling.cpp new file mode 100644 index 00000000000..318bbee2260 --- /dev/null +++ b/src/gpu/nvidia/cudnn_resampling.cpp @@ -0,0 +1,94 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "sycl/sycl_buffer_memory_storage.hpp" + +#include "gpu/nvidia/cudnn_resampling.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_resampling_fwd_t::execute(const exec_ctx_t &ctx) const { + if (memory_desc_wrapper(pd()->src_md()).has_zero_dim()) + return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + auto grid_acc = buffer(grid_storage_.get()) + .get_access(cgh); + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + std::vector args; + + args.push_back(sc.memory(ih, src_acc)); + args.push_back(sc.memory(ih, grid_acc)); + args.push_back(sc.memory(ih, dst_acc)); + + pd()->resampling_impl_->execute(handle, args); + }); + }); + + return status::success; +} + +status_t cudnn_resampling_bwd_t::execute(const exec_ctx_t &ctx) const { + if (memory_desc_wrapper(pd()->diff_src_md()).has_zero_dim()) + return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC); + auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + auto grid_acc = buffer(grid_storage_.get()) + .get_access(cgh); + auto diff_grid_acc + = CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none); + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + std::vector args; + args.push_back(sc.memory(ih, diff_src_acc)); + args.push_back(sc.memory(ih, 
diff_dst_acc)); + args.push_back(sc.memory(ih, grid_acc)); + args.push_back(sc.memory(ih, diff_grid_acc)); + + pd()->resampling_impl_->execute(handle, args); + }); + }); + + return status::success; +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_resampling.hpp b/src/gpu/nvidia/cudnn_resampling.hpp new file mode 100644 index 00000000000..10f04f92746 --- /dev/null +++ b/src/gpu/nvidia/cudnn_resampling.hpp @@ -0,0 +1,269 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_RESAMPLING_HPP +#define GPU_NVIDIA_CUDNN_RESAMPLING_HPP + +#include +#include + +#include "common/c_types_map.hpp" +#include "common/primitive.hpp" +#include "common/resampling_pd.hpp" +#include "common/type_helpers.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +#include "gpu/nvidia/cudnn_resampling_impl.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_resampling_pd_base_t { +protected: + status_t init_mem_by_tag(format_tag_t tag, memory_desc_t &md) { + if (tag == format_tag::undef) return status::unimplemented; + CHECK(memory_desc_init_by_tag(md, tag)); + return status::success; + } +}; + +struct cudnn_resampling_base_t : public primitive_t { +protected: + using primitive_t::primitive_t; + template + struct theta_t { + data_t s0_, i_, tx_; + data_t j_, s1_, ty_; + theta_t(data_t s0, data_t i, data_t tx, data_t j, data_t s1, data_t ty) + : s0_(s0), i_(i), tx_(tx), j_(j), s1_(s1), ty_(ty) {} + }; + + cl::sycl::buffer &buffer(memory_storage_t *mem_storage) { + return utils::downcast( + mem_storage) + ->buffer(); + } + cl::sycl::buffer &buffer(memory_storage_t *mem_storage) const { + return utils::downcast( + mem_storage) + ->buffer(); + } + template + status_t prepare_coordinate_grid(engine_t *engine, const pd_t *pd) { + using io = cudnn_resampling_impl_base_t::io; + int ndims = pd->resampling_impl_->ndims(); + data_t OW = pd->resampling_impl_->dims_[io::dst][ndims - 1], + IW = pd->resampling_impl_->dims_[io::src][ndims - 1], + OH = pd->resampling_impl_->dims_[io::dst][ndims - 2], + IH = pd->resampling_impl_->dims_[io::src][ndims - 2]; + // cudnn uses the normalized value between -1<=(xsi, ysi)<= 1 for + // building the grid. Therefore, scaling parameter for tau_theta must be + // adjusted for computing the normalized value per grid. 
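+        // For example, upsampling a row of IW = 2 pixels to OW = 4 gives
+        // w = IW * (OW - 1) / (OW * (IW - 1)) = 2 * 3 / (4 * 1) = 1.5 for the
+        // horizontal scale of tau_theta (and h is computed the same way for
+        // the vertical scale).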
+        data_t w = 1;
+        if (IW != 1 && IW != OW) w = IW * (OW - 1) / (OW * (IW - 1));
+
+        data_t h = 1;
+        if (IH != 1 && IH != OH) h = IH * (OH - 1) / (OH * (IH - 1));
+
+        // the size of tau_theta (a 2x3 affine matrix) is fixed by cudnn
+        int tau_theta_size = 2 * 3;
+        auto theta_size = pd->MB();
+        auto tau_theta = theta_t {w, 0.f, 0.f, 0.f, h, 0.f};
+        std::vector> theta_data(theta_size, tau_theta);
+
+        auto grid_size = pd->MB() * pd->OH() * pd->OW() * 2;
+        auto sycl_engine = utils::downcast(engine);
+
+        auto theta_size_in_byte = tau_theta_size * theta_size * sizeof(data_t);
+        auto grid_size_in_byte = grid_size * sizeof(data_t);
+
+        memory_storage_t *mem_grid_ptr;
+        CHECK(sycl_engine->create_memory_storage(&mem_grid_ptr,
+                memory_flags_t::alloc, grid_size_in_byte, nullptr));
+        grid_storage_.reset(mem_grid_ptr);
+
+        memory_storage_t *mem_theta_ptr;
+        CHECK(sycl_engine->create_memory_storage(&mem_theta_ptr,
+                memory_flags_t::alloc, theta_size_in_byte, nullptr));
+        theta_storage_.reset(mem_theta_ptr);
+
+        stream_t *service_stream;
+        CHECK(sycl_engine->get_service_stream(service_stream));
+
+        auto cuda_stream
+                = utils::downcast(service_stream);
+        auto event = copy(cuda_stream->queue(),
+                reinterpret_cast(theta_data.data()),
+                buffer(theta_storage_.get()));
+        auto &st_desc_ = pd->resampling_impl_->st_desc_;
+        cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
+            cgh.depends_on(event);
+            auto theta_acc
+                    = buffer(theta_storage_.get())
+                              .get_access(cgh);
+            auto grid_acc
+                    = buffer(grid_storage_.get())
+                              .get_access(cgh);
+
+            cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
+                // The scoped context makes sure the top of the context stack
+                // is the engine context while calling cudnn.
+                auto &s_engine = *utils::downcast(engine);
+                cuda_sycl_scoped_context_handler_t sc(s_engine);
+                auto handle = cuda_stream->get_cudnn_handle();
+                auto theta = sc.memory(ih, theta_acc);
+                auto grid = sc.memory(ih, grid_acc);
+                CUDNN_EXECUTE_FUNC(cudnnSpatialTfGridGeneratorForward, handle,
+                        st_desc_, theta, grid);
+            });
+        });
+
+        // cudnn requires the grid data to be normalized to
+        // (-1, -1) <= (xsi, ysi) <= (1, 1). For values outside this boundary
+        // cudnn assumes 0, while oneDNN uses the boundary values. So we clamp
+        // the out-of-boundary values to the boundary. This fixes the
+        // upsampling issue.
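+        // For example, a grid entry of 1.08 produced by the generator is
+        // clamped to 1.0 (and -1.08 to -1.0), so sampling uses the edge value
+        // instead of the zero that cuDNN would otherwise assume; values
+        // already inside [-1, 1] are left untouched.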
+ std::vector unbound_raw_grid(grid_size); + auto event2 = copy(cuda_stream->queue(), buffer(grid_storage_.get()), + reinterpret_cast(unbound_raw_grid.data())); + event2.wait(); + for (int i = 0; i < grid_size; i++) { + if (std::fabs(unbound_raw_grid[i]) > 1) + unbound_raw_grid[i] = unbound_raw_grid[i] + / (std::fabs(unbound_raw_grid[i])); + } + + auto event3 = copy(cuda_stream->queue(), + reinterpret_cast(unbound_raw_grid.data()), + buffer(grid_storage_.get())); + event3.wait(); + return status::success; + } + std::unique_ptr grid_storage_; + std::unique_ptr theta_storage_; +}; + +struct cudnn_resampling_fwd_t : public cudnn_resampling_base_t { + using cudnn_resampling_base_t::cudnn_resampling_base_t; + struct pd_t : public resampling_fwd_pd_t, + public cudnn_resampling_pd_base_t { + using cudnn_resampling_pd_base_t::cudnn_resampling_pd_base_t; + using resampling_fwd_pd_t::resampling_fwd_pd_t; + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_resampling_fwd_t); + + status_t init(engine_t *engine) { + using namespace data_type; + using namespace format_tag; + + assert(engine->kind() == engine_kind::gpu); + + bool ok = desc()->alg_kind == alg_kind::resampling_linear + && is_fwd() && utils::one_of(src_md()->data_type, f32, f16) + && src_md()->data_type == dst_md()->data_type + && set_default_params() == status::success + && attr()->has_default_values(); + if (!ok) return status::unimplemented; + + // src must have a tag and src must follow the same tag + format_tag_t dat_tag = memory_desc_matches_one_of_tag( + *src_md(), ncw, nchw, nwc, nhwc); + if (dat_tag == format_tag::undef) return status::unimplemented; + if (!memory_desc_matches_tag(*dst_md(), dat_tag)) { + return status::unimplemented; + } + + resampling_impl_.reset(new cudnn_resampling_fwd_impl_t()); + return resampling_impl_->init(this); + } + + std::shared_ptr resampling_impl_; + }; + + status_t init(engine_t *engine) override { + status_t status; + auto wrap = memory_desc_wrapper(pd()->src_md()); + switch (wrap.data_type()) { + case data_type::f32: + status = prepare_coordinate_grid(engine, pd()); + break; + case data_type::f16: + status = prepare_coordinate_grid(engine, pd()); + break; + default: status = status::unimplemented; + } + return status; + } + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +struct cudnn_resampling_bwd_t : public cudnn_resampling_base_t { + using cudnn_resampling_base_t::cudnn_resampling_base_t; + struct pd_t : public resampling_bwd_pd_t, + public cudnn_resampling_pd_base_t { + using cudnn_resampling_pd_base_t::cudnn_resampling_pd_base_t; + using resampling_bwd_pd_t::resampling_bwd_pd_t; + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_resampling_bwd_t); + + status_t init(engine_t *engine) { + using namespace data_type; + using namespace format_tag; + + assert(engine->kind() == engine_kind::gpu); + bool ok = desc()->alg_kind == alg_kind::resampling_linear + && !is_fwd() && utils::one_of(diff_src_md()->data_type, f32) + && diff_src_md()->data_type == diff_dst_md()->data_type + && set_default_params() == status::success + && attr()->has_default_values(); + if (!ok) return status::unimplemented; + // dst must have a tag and src must follow the same tag + format_tag_t dat_tag = memory_desc_matches_one_of_tag( + *diff_dst_md(), ncw, nchw, nwc, nhwc); + if (dat_tag == format_tag::undef) return status::unimplemented; + if (!memory_desc_matches_tag(*diff_src_md(), dat_tag)) { + return status::unimplemented; + } + + 
resampling_impl_.reset(new cudnn_resampling_bwd_impl_t()); + return resampling_impl_->init(this); + } + std::shared_ptr resampling_impl_; + }; + status_t init(engine_t *engine) override { + return prepare_coordinate_grid(engine, pd()); + } + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_resampling_impl.hpp b/src/gpu/nvidia/cudnn_resampling_impl.hpp new file mode 100644 index 00000000000..acc94feb698 --- /dev/null +++ b/src/gpu/nvidia/cudnn_resampling_impl.hpp @@ -0,0 +1,171 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_RESAMPLING_IMPL_HPP +#define GPU_NVIDIA_CUDNN_RESAMPLING_IMPL_HPP + +#include + +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_resampling_impl_base_t { + virtual ~cudnn_resampling_impl_base_t() { + for (int i = 0; i < NUM_IO; ++i) { + if (tensor_descs_[i]) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroyTensorDescriptor, tensor_descs_[i]); + } + } + + if (st_desc_) { + CUDNN_EXECUTE_FUNC_V( + cudnnDestroySpatialTransformerDescriptor, st_desc_); + } + } + + virtual status_t init(resampling_pd_t *pd) = 0; + + virtual void execute( + cudnnHandle_t handle, const std::vector &args) const = 0; + + int ndims() { return ndims_; } + + status_t create_and_set_st_desc() { + CHECK(CUDNN_EXECUTE_FUNC_S( + cudnnCreateSpatialTransformerDescriptor, &st_desc_)); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetSpatialTransformerNdDescriptor, + st_desc_, CUDNN_SAMPLER_BILINEAR, data_types_[dst], ndims_, + dims_[dst])); + + return status::success; + } + + enum io { src, dst, NUM_IO }; + int dims_[NUM_IO][DNNL_MAX_NDIMS]; + int strides_[NUM_IO][DNNL_MAX_NDIMS]; + cudnnDataType_t data_types_[NUM_IO]; + cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {}; + cudnnSpatialTransformerDescriptor_t st_desc_; + int ndims_; + const float alpha_ = 1.f, beta_ = 0.f; +}; + +struct cudnn_resampling_fwd_impl_t : public cudnn_resampling_impl_base_t { + status_t init(resampling_pd_t *pd) override { + ndims_ = std::max(4, pd->ndims()); + + if (ndims_ > 4) return status::unimplemented; + + cudnnTensorFormat_t src_format, dst_format; + CHECK(get_format(pd->src_md(), dst_format)); + CHECK(get_format(pd->dst_md(), src_format)); + convert_dims(pd->src_md()->padded_dims, dims_[src], pd->ndims()); + convert_dims(pd->src_md()->format_desc.blocking.strides, strides_[src], + pd->ndims(), 4, + (dst_format != CUDNN_TENSOR_NHWC ? 
1 : dims_[src][1]));
+        convert_dims(pd->dst_md()->padded_dims, dims_[dst], pd->ndims());
+        convert_dims(pd->dst_md()->format_desc.blocking.strides, strides_[dst],
+                pd->ndims(), 4,
+                (dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[dst][1]));
+
+        CHECK(convert_data_type(pd->src_md(), &data_types_[src]));
+        CHECK(convert_data_type(pd->dst_md(), &data_types_[dst]));
+
+        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[src],
+                data_types_[src], ndims_, dims_[src], strides_[src]));
+        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst],
+                data_types_[dst], ndims_, dims_[dst], strides_[dst]));
+
+        CHECK(create_and_set_st_desc());
+        return status::success;
+    }
+
+    void execute(cudnnHandle_t handle,
+            const std::vector &args) const override {
+
+        CUDNN_EXECUTE_FUNC(cudnnSpatialTfSamplerForward, handle, st_desc_,
+                &alpha_, tensor_descs_[src], args[0], args[1], &beta_,
+                tensor_descs_[dst], args[2]);
+    }
+};
+
+struct cudnn_resampling_bwd_impl_t : public cudnn_resampling_impl_base_t {
+
+    status_t init(resampling_pd_t *pd) override {
+        ndims_ = std::max(4, pd->ndims());
+
+        if (ndims_ > 4) return status::unimplemented;
+
+        cudnnTensorFormat_t src_format, dst_format;
+        CHECK(get_format(pd->diff_src_md(), dst_format));
+        CHECK(get_format(pd->diff_dst_md(), src_format));
+        convert_dims(pd->diff_src_md()->padded_dims, dims_[src], pd->ndims());
+        convert_dims(pd->diff_src_md()->format_desc.blocking.strides,
+                strides_[src], pd->ndims(), 4,
+                (dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[src][1]));
+        convert_dims(pd->diff_dst_md()->padded_dims, dims_[dst], pd->ndims());
+        convert_dims(pd->diff_dst_md()->format_desc.blocking.strides,
+                strides_[dst], pd->ndims(), 4,
+                (dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[dst][1]));
+
+        CHECK(convert_data_type(pd->diff_src_md(), &data_types_[src]));
+        CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[dst]));
+
+        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[src],
+                data_types_[src], ndims_, dims_[src], strides_[src]));
+        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst],
+                data_types_[dst], ndims_, dims_[dst], strides_[dst]));
+
+        CHECK(create_and_set_st_desc());
+        auto wrap = memory_desc_wrapper(pd->diff_src_md());
+
+        auto grid_size = pd->MB() * pd->OH() * pd->OW() * 2;
+        auto grid_size_in_byte = grid_size * wrap.data_type_size();
+        // cuDNN does not allow dgrid to be a NULL pointer. Although we don't
+        // need to compute dgrid, since theta does not come from a
+        // localization network, cuDNN still requires a valid pointer, so we
+        // book a scratchpad for dgrid.
+        pd->scratchpad_registry().registrar().book(
+                memory_tracking::names::key_none, grid_size_in_byte, size_t(1));
+        return status::success;
+    }
+
+    void execute(cudnnHandle_t handle,
+            const std::vector &args) const override {
+        // we are not backpropagating for the grid here.
+ // So both alpha and beta are zero and the dgrid value + // wont be used + CUDNN_EXECUTE_FUNC(cudnnSpatialTfSamplerBackward, handle, st_desc_, + &alpha_, tensor_descs_[src], args[0], &beta_, + tensor_descs_[src], args[0], &beta_, tensor_descs_[dst], + args[1], args[2], &beta_, args[3]); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_softmax.cpp b/src/gpu/nvidia/cudnn_softmax.cpp new file mode 100644 index 00000000000..3b84c7e5e1a --- /dev/null +++ b/src/gpu/nvidia/cudnn_softmax.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/nvidia/cudnn_softmax.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "sycl/sycl_buffer_memory_storage.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +status_t cudnn_softmax_fwd_t::execute(const exec_ctx_t &ctx) const { + if (memory_desc_wrapper(pd()->desc()->data_desc).has_zero_dim()) + return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC); + auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST); + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + std::vector args; + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + args.push_back(sc.memory(ih, src_acc)); + args.push_back(sc.memory(ih, dst_acc)); + + pd()->softmax_impl_->execute(handle, args.data(), args.size()); + }); + }); +} + +status_t cudnn_softmax_bwd_t::execute(const exec_ctx_t &ctx) const { + if (memory_desc_wrapper(pd()->desc()->diff_desc).has_zero_dim()) + return status::success; + + nvidia::sycl_cuda_stream_t *cuda_stream + = utils::downcast(ctx.stream()); + + return cuda_stream->interop_task([&](cl::sycl::handler &cgh) { + auto dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DST); + auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST); + auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC); + + cgh.interop_task([=](const cl::sycl::interop_handler &ih) { + std::vector args; + auto &sycl_engine = *utils::downcast( + cuda_stream->engine()); + auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine); + auto handle = cuda_stream->get_cudnn_handle(); + + args.push_back(sc.memory(ih, dst_acc)); + args.push_back(sc.memory(ih, diff_dst_acc)); + args.push_back(sc.memory(ih, diff_src_acc)); + + pd()->softmax_impl_->execute(handle, args.data(), args.size()); + }); + }); +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git 
a/src/gpu/nvidia/cudnn_softmax.hpp b/src/gpu/nvidia/cudnn_softmax.hpp new file mode 100644 index 00000000000..3600f018956 --- /dev/null +++ b/src/gpu/nvidia/cudnn_softmax.hpp @@ -0,0 +1,116 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_SOFTMAX_HPP +#define GPU_NVIDIA_CUDNN_SOFTMAX_HPP + +#include "cudnn.h" + +#include + +#include "common/primitive.hpp" +#include "common/softmax_pd.hpp" +#include "gpu/nvidia/cudnn_softmax_impl.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_softmax_fwd_t : public primitive_t { + using primitive_t::primitive_t; + + struct pd_t : public softmax_fwd_pd_t { + using softmax_fwd_pd_t::softmax_fwd_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_softmax_fwd_t); + + status_t init(engine_t *) { + bool ok = true + && utils::one_of(desc()->prop_kind, + prop_kind::forward_inference, + prop_kind::forward_training) + && utils::one_of(desc()->data_desc.data_type, + data_type::f32, data_type::f16) + // Blocking is supported only for s8 and softmax does not + // support it. 
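+                    // (inner_nblks == 0 means the memory descriptor uses a
+                    // plain, non-blocked layout such as abcd or acdb.)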
+ && src_md()->format_desc.blocking.inner_nblks == 0 + && dst_md()->format_desc.blocking.inner_nblks == 0 + && attr()->has_default_values(); + + if (!ok) return status::unimplemented; + + softmax_impl_.reset(new cudnn_softmax_fwd_impl_t()); + + return softmax_impl_->init(this); + } + + std::shared_ptr softmax_impl_; + }; + + cudnn_softmax_fwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +struct cudnn_softmax_bwd_t : public primitive_t { + using primitive_t::primitive_t; + + struct pd_t : public softmax_bwd_pd_t { + using softmax_bwd_pd_t::softmax_bwd_pd_t; + + DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_softmax_bwd_t); + + status_t init(engine_t *) { + bool ok = true && desc()->prop_kind == prop_kind::backward_data + && utils::one_of(desc()->data_desc.data_type, + data_type::f32, data_type::f16) + && set_default_formats_common() + // Blocking is not supported + && dst_md()->format_desc.blocking.inner_nblks == 0 + && diff_dst_md()->format_desc.blocking.inner_nblks == 0 + && attr()->has_default_values(); + + if (!ok) return status::unimplemented; + + softmax_impl_.reset(new cudnn_softmax_bwd_impl_t()); + + return softmax_impl_->init(this); + } + + std::shared_ptr softmax_impl_; + }; + + cudnn_softmax_bwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_softmax_impl.hpp b/src/gpu/nvidia/cudnn_softmax_impl.hpp new file mode 100644 index 00000000000..afe395ac44d --- /dev/null +++ b/src/gpu/nvidia/cudnn_softmax_impl.hpp @@ -0,0 +1,255 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_SOFTMAX_IMPL_HPP +#define GPU_NVIDIA_CUDNN_SOFTMAX_IMPL_HPP + +#include "cudnn.h" + +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_softmax_impl_base_t { + cudnnDataType_t data_type; + int ndims; + cudnnSoftmaxAlgorithm_t alg_kind; + // cuDNN only supports softmax on channel dimension + cudnnSoftmaxMode_t mode = cudnnSoftmaxMode_t::CUDNN_SOFTMAX_MODE_CHANNEL; + // oneDNN softmax primitive doesn't support any post-ops or attributes, + // hence we can set alpha = 1 and beta = 0 for all cases + float alpha = 1.0f; + float beta = 0.0f; + + virtual ~cudnn_softmax_impl_base_t() {} + + virtual status_t init(const softmax_pd_t *pd) = 0; + + virtual void execute(cudnnHandle_t handle, void **x, int size) const = 0; + + // Mapping between dnnl algorithm and cuDNN softmax algorithm + status_t convert_alg_kind( + bool is_log_softmax, cudnnSoftmaxAlgorithm_t *cuda_alg_kind) const { + if (is_log_softmax) { + *cuda_alg_kind = cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_LOG; + } else { + *cuda_alg_kind = cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_ACCURATE; + } + return status::success; + } + + status_t convert_dims_softmax(const dims_t &orig_dims, int *modified_dims, + int axis, int ndims, format_tag_t tag, + cudnnTensorFormat_t &format) const { + + // Initialise all dims to 1 + for (int i = 0; i < 4; i++) { + modified_dims[i] = 1; + } + if (axis == 1) { + // Copy dimensions into the new array + format = tag == dnnl_nhwc ? cudnnTensorFormat_t::CUDNN_TENSOR_NHWC + : cudnnTensorFormat_t::CUDNN_TENSOR_NCHW; + int num_dims = ndims < 4 ? ndims : 4; + for (int i = 0; i < num_dims; i++) { + modified_dims[i] = orig_dims[i]; + } + for (int i = 4; i < ndims; i++) { + modified_dims[3] *= orig_dims[i]; + } + return status::success; + } + format = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW; + switch (tag) { + case dnnl_cn: { + modified_dims[0] = orig_dims[1]; + modified_dims[1] = orig_dims[0]; + break; + } + case dnnl_nchw: { + switch (axis) { + case 0: + modified_dims[1] = orig_dims[axis]; + modified_dims[2] = orig_dims[1]; + for (int i = 2; i < ndims; i++) { + modified_dims[3] *= orig_dims[i]; + } + break; + default: { + for (int i = 0; i < axis; i++) { + modified_dims[0] *= orig_dims[i]; + } + modified_dims[1] = orig_dims[axis]; + if (axis == ndims - 1) { return status::success; } + for (int i = axis + 1; i < ndims; i++) { + modified_dims[2] *= orig_dims[i]; + } + break; + } + } + break; + } + case dnnl_nhwc: + switch (axis) { + case 0: + modified_dims[1] = orig_dims[0]; + for (int i = 1; i < ndims; i++) { + modified_dims[2] *= orig_dims[i]; + } + break; + case 2: + modified_dims[0] = orig_dims[0]; + modified_dims[1] = orig_dims[2]; + for (int i = 3; i < ndims; i++) { + modified_dims[2] *= orig_dims[i]; + } + modified_dims[3] = orig_dims[1]; + break; + case 3: + modified_dims[0] = orig_dims[0] * orig_dims[2]; + modified_dims[1] = orig_dims[3]; + modified_dims[2] = ndims == 4 ? 
1 : orig_dims[4]; + modified_dims[3] = orig_dims[1]; + break; + } + break; + default: return status::unimplemented; + } + return status::success; + } + + status_t convert_tag(const memory_desc_t *md, format_tag_t &tag) const { + const memory_desc_wrapper mem_wrapper(md); + if (mem_wrapper.matches_one_of_tag(format_tag::ba)) { + tag = dnnl_cn; + } else if (mem_wrapper.matches_one_of_tag(format_tag::ab, + format_tag::abc, format_tag::abcd, format_tag::abcde, + format_tag::abcdef)) { + tag = dnnl_nchw; + } else if (mem_wrapper.matches_one_of_tag(format_tag::acb, + format_tag::acdb, format_tag::acdeb)) { + tag = dnnl_nhwc; + } else { + return status::unimplemented; + } + return status::success; + } +}; + +struct cudnn_softmax_fwd_impl_t : public cudnn_softmax_impl_base_t { + int dims[DNNL_MAX_NDIMS]; + cudnnTensorDescriptor_t tensor_desc; + cudnnTensorFormat_t format; + + status_t init(const softmax_pd_t *pd) override { + // If any of the dimensions are 0 we should not continue with + // creating cudnn descriptors + if (has_zero_dims(pd->src_md(0)->dims, pd->ndims())) { + return status::success; + } + + if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; } + ndims = pd->ndims() < 4 ? 4 : pd->ndims(); + + format_tag_t tag; + CHECK(convert_tag(pd->src_md(), tag)); + CHECK(convert_dims_softmax(pd->src_md()->padded_dims, dims, pd->axis(), + pd->ndims(), tag, format)); + + convert_alg_kind(pd->is_logsoftmax(), &alg_kind); + + assert(pd->src_md()->data_type == pd->dst_md()->data_type); + + CHECK(convert_data_type(pd->src_md(), &data_type)); + + CHECK(create_and_set_tensor_descriptor_ex( + &tensor_desc, format, data_type, 4, dims)); + return status::success; + } + + void execute(cudnnHandle_t handle, void **x, int size) const override { + // Confirm that 2 arguments were passed, src and dst + assert(size == 2); + CUDNN_EXECUTE_FUNC(cudnnSoftmaxForward, handle, alg_kind, mode, &alpha, + tensor_desc, x[0], &beta, tensor_desc, x[1]); + } + + ~cudnn_softmax_fwd_impl_t() { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_desc); + } +}; + +struct cudnn_softmax_bwd_impl_t : public cudnn_softmax_impl_base_t { + int dims[DNNL_MAX_NDIMS]; + int dims_dst[DNNL_MAX_NDIMS]; + cudnnTensorDescriptor_t tensor_dst_desc; + cudnnTensorDescriptor_t tensor_diff_desc; + cudnnTensorFormat_t format; + + status_t init(const softmax_pd_t *pd) override { + // If any of the dimensions are 0 we should not continue with + // creating cudnn descriptors + if (memory_desc_wrapper(pd->desc()->diff_desc).has_zero_dim()) + return status::success; + + if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; } + ndims = pd->ndims() < 4 ? 
4 : pd->ndims(); + + format_tag_t tag; + CHECK(convert_tag(pd->dst_md(), tag)); + CHECK(convert_dims_softmax(pd->dst_md()->padded_dims, dims_dst, + pd->axis(), pd->ndims(), tag, format)); + CHECK(convert_dims_softmax(pd->diff_src_md()->padded_dims, dims, + pd->axis(), pd->ndims(), tag, format)); + + convert_alg_kind(pd->is_logsoftmax(), &alg_kind); + + assert(pd->diff_dst_md()->data_type == pd->dst_md()->data_type); + assert(pd->diff_dst_md()->data_type == pd->diff_src_md()->data_type); + + CHECK(convert_data_type(pd->dst_md(), &data_type)); + + CHECK(create_and_set_tensor_descriptor_ex( + &tensor_dst_desc, format, data_type, 4, dims_dst)); + CHECK(create_and_set_tensor_descriptor_ex( + &tensor_diff_desc, format, data_type, 4, dims)); + return status::success; + } + + void execute(cudnnHandle_t handle, void **x, int size) const override { + // Assert that 3 arguments were passed src, diff_dst and diff_src + assert(size == 3); + CUDNN_EXECUTE_FUNC(cudnnSoftmaxBackward, handle, alg_kind, mode, &alpha, + tensor_dst_desc, x[0], tensor_diff_desc, x[1], &beta, + tensor_diff_desc, x[2]); + } + + ~cudnn_softmax_bwd_impl_t() { + CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_dst_desc); + CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_diff_desc); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/cudnn_sum.cpp b/src/gpu/nvidia/cudnn_sum.cpp new file mode 100644 index 00000000000..56caa138714 --- /dev/null +++ b/src/gpu/nvidia/cudnn_sum.cpp @@ -0,0 +1,41 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/nvidia/cudnn_sum.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +namespace { + +using spd_create_f = dnnl::impl::engine_t::sum_primitive_desc_create_f; + +const spd_create_f cuda_sum_impl_list[] + = {cudnn_ref_sum_t::pd_t::create, nullptr}; +} // namespace + +const spd_create_f *cuda_gpu_engine_impl_list_t::get_sum_implementation_list() { + return cuda_sum_impl_list; +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_sum.hpp b/src/gpu/nvidia/cudnn_sum.hpp new file mode 100644 index 00000000000..c93ca81ecfc --- /dev/null +++ b/src/gpu/nvidia/cudnn_sum.hpp @@ -0,0 +1,70 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_CUDNN_SUM_HPP +#define GPU_NVIDIA_CUDNN_SUM_HPP +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" +#include "gpu/ocl/ref_sum.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +struct cudnn_ref_sum_t : public ::dnnl::impl::gpu::ocl::ref_sum_t { + + using base_t = dnnl::impl::gpu::ocl::ref_sum_t; + using base_t::base_t; + using base_pd_t = base_t::pd_t; + + struct pd_t : public base_pd_t { + + using base_pd_t::base_pd_t; + + DECLARE_SUM_PD_T("ref:any", cudnn_ref_sum_t); + // This function can be used for backend that does not support + // blocking on f32, so it can convert the blocked format to nchw. Since + // the final destination will preserve the blocking, the last reorder + // to put the accumulated result to the final output will add the + // blocking back. + void define_dst_acc_md() override { + dst_acc_md_ = dst_md_; + dst_acc_md_.data_type = dnnl_f32; + if ((dst_md_.data_type == data_type::s8) + && (memory_desc_matches_nchw_vect_c(&dst_md_))) { + dst_acc_md_.format_desc.blocking.inner_nblks = 0; + dst_acc_md_.format_desc.blocking.inner_idxs[0] = 0; + dst_acc_md_.format_desc.blocking.inner_blks[0] = 0; + dst_acc_md_.format_desc.blocking.strides[dst_acc_md_.ndims - 1] + = 1; + for (int i = dst_acc_md_.ndims - 2; i >= 0; i--) { + dst_acc_md_.format_desc.blocking.strides[i] + = dst_acc_md_.format_desc.blocking.strides[i + 1] + * dst_acc_md_.dims[i + 1]; + } + } + } + }; +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/sycl_cuda_engine.cpp b/src/gpu/nvidia/sycl_cuda_engine.cpp new file mode 100644 index 00000000000..a7f16cc3b74 --- /dev/null +++ b/src/gpu/nvidia/sycl_cuda_engine.cpp @@ -0,0 +1,199 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include + +#include "sycl/sycl_utils.hpp" + +#include "gpu/nvidia/cudnn_batch_normalization.hpp" +#include "gpu/nvidia/cudnn_binary.hpp" +#include "gpu/nvidia/cudnn_conv_inner_product.hpp" +#include "gpu/nvidia/cudnn_convolution.hpp" +#include "gpu/nvidia/cudnn_deconvolution.hpp" +#include "gpu/nvidia/cudnn_eltwise.hpp" +#include "gpu/nvidia/cudnn_gemm_inner_product.hpp" +#include "gpu/nvidia/cudnn_lrn.hpp" +#include "gpu/nvidia/cudnn_matmul.hpp" +#include "gpu/nvidia/cudnn_pooling.hpp" +#include "gpu/nvidia/cudnn_resampling.hpp" +#include "gpu/nvidia/cudnn_softmax.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" +#include "gpu/nvidia/sycl_cuda_stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +bool is_nvidia_gpu(const cl::sycl::device &dev) { + constexpr int nvidia_vendor_id = 0x10DE; + return dev.is_gpu() + && dev.get_info() + == nvidia_vendor_id; +} + +status_t cuda_engine_create(engine_t **engine, engine_kind_t engine_kind, + const cl::sycl::device &dev, const cl::sycl::context &ctx) { + CHECK(nvidia::check_device(engine_kind)); + std::unique_ptr cuda_engine( + (new nvidia::sycl_cuda_engine_t(dev, ctx))); + if (!cuda_engine) return status::out_of_memory; + + CHECK(cuda_engine->init()); + *engine = cuda_engine.release(); + + return status::success; +} + +sycl_cuda_engine_t::sycl_cuda_engine_t(engine_kind_t kind, + const cl::sycl::device &dev, const cl::sycl::context &ctx) + : base_t(kind, dev, ctx) { + underlying_context_type(); + set_cudnn_handle(); + set_cublas_handle(); +} + +sycl_cuda_engine_t::sycl_cuda_engine_t( + const cl::sycl::device &dev, const cl::sycl::context &ctx) + : sycl_cuda_engine_t(engine_kind::gpu, dev, ctx) { + assert(is_nvidia_gpu(dev)); +} + +status_t sycl_cuda_engine_t::set_cublas_handle() { + // scoped context will make sure the top of the stack context is + // the engine context while creating the cublas handle. + cublasHandle_t handle; + cuda_sycl_scoped_context_handler_t sc(*this); + CHECK(CUBLAS_EXECUTE_FUNC_S(cublasCreate, &handle)); + cublas_handle_.reset(new cublasHandle_t(handle)); + handle = nullptr; + return status::success; +} + +status_t sycl_cuda_engine_t::set_cudnn_handle() { + // scoped context will make sure the top of the stack context is + // the engine context while creating the cublas handle. + cudnnHandle_t handle; + cuda_sycl_scoped_context_handler_t sc(*this); + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreate, &handle)); + cudnn_handle_.reset(new cudnnHandle_t(handle)); + handle = nullptr; + return status::success; +} + +CUcontext sycl_cuda_engine_t::get_underlying_context() const { + return cl::sycl::get_native(context()); +} + +status_t sycl_cuda_engine_t::create_stream(stream_t **stream, unsigned flags) { + return sycl_cuda_stream_t::create_stream(stream, this, flags); +} + +status_t sycl_cuda_engine_t::create_stream( + stream_t **stream, cl::sycl::queue &queue) { + return sycl_cuda_stream_t::create_stream(stream, this, queue); +} + +status_t sycl_cuda_engine_t::underlying_context_type() { + // this is a costly function which take avarage up to 75ms + // on titanrx. 
So we must run it once and store the variable + // in is_primary_context_; + CUcontext primary; + CUcontext desired + = cl::sycl::get_native(context()); + CUdevice cuda_device + = cl::sycl::get_native(device()); + CHECK(CUDA_EXECUTE_FUNC_S(cuDevicePrimaryCtxRetain, &primary, cuda_device)); + CHECK(CUDA_EXECUTE_FUNC_S(cuDevicePrimaryCtxRelease, cuda_device)); + primary_context_ = (primary == desired); + return status::success; +} + +device_id_t sycl_cuda_engine_t::device_id() const { + return device_id_t(static_cast(sycl::backend_t::nvidia), + static_cast( + cl::sycl::get_native(device())), + static_cast(0)); +} + +namespace { +using namespace dnnl::impl::data_type; +#define INSTANCE(...) &primitive_desc_t::create<__VA_ARGS__::pd_t> +// clang-format off +const dnnl::impl::engine_t::primitive_desc_create_f sycl_cuda_impl_list[] = { + // Elementwise + INSTANCE(cudnn_eltwise_fwd_t), + INSTANCE(cudnn_eltwise_bwd_t), + + // Deconvolution + INSTANCE(cudnn_deconvolution_fwd_t), + INSTANCE(cudnn_deconvolution_bwd_data_t), + INSTANCE(cudnn_deconvolution_bwd_weights_t), + + // Convolution + INSTANCE(cudnn_convolution_fwd_t), + INSTANCE(cudnn_convolution_bwd_data_t), + INSTANCE(cudnn_convolution_bwd_weights_t), + + // Batch Normalization + INSTANCE(cudnn_batch_normalization_fwd_t), + INSTANCE(cudnn_batch_normalization_bwd_t), + + // Pooling + INSTANCE(cudnn_pooling_fwd_t), + INSTANCE(cudnn_pooling_bwd_t), + + // LRN + INSTANCE(cudnn_lrn_fwd_t), + INSTANCE(cudnn_lrn_bwd_t), + + // Inner Product + INSTANCE(cudnn_gemm_inner_product_fwd_t), + INSTANCE(cudnn_conv_inner_product_fwd_t), + INSTANCE(cudnn_gemm_inner_product_bwd_data_t), + INSTANCE(cudnn_conv_inner_product_bwd_data_t), + INSTANCE(cudnn_gemm_inner_product_bwd_weights_t), + INSTANCE(cudnn_conv_inner_product_bwd_weights_t), + + // Softmax + INSTANCE(cudnn_softmax_fwd_t), + INSTANCE(cudnn_softmax_bwd_t), + + // Binary + INSTANCE(cudnn_binary_t), + + // MatMul + INSTANCE(cudnn_matmul_t), + + // Resampling + INSTANCE(cudnn_resampling_fwd_t), + INSTANCE(cudnn_resampling_bwd_t), + nullptr, +}; +// clang-format on +#undef INSTANCE +} // namespace +const dnnl::impl::engine_t::primitive_desc_create_f * +sycl_cuda_engine_t::get_implementation_list(const op_desc_t *) const { + return sycl_cuda_impl_list; +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/sycl_cuda_engine.hpp b/src/gpu/nvidia/sycl_cuda_engine.hpp new file mode 100644 index 00000000000..1f774eaff27 --- /dev/null +++ b/src/gpu/nvidia/sycl_cuda_engine.hpp @@ -0,0 +1,121 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#ifndef GPU_NVIDIA_SYCL_CUDA_ENGINE_HPP
+#define GPU_NVIDIA_SYCL_CUDA_ENGINE_HPP
+
+#include
+#include
+
+#include
+
+#include "common/stream.hpp"
+#include "gpu/nvidia/sycl_cuda_utils.hpp"
+#include "sycl/sycl_device_info.hpp"
+#include "sycl/sycl_engine_base.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace nvidia {
+
+class cuda_gpu_engine_impl_list_t {
+public:
+    static const dnnl::impl::engine_t::reorder_primitive_desc_create_f *
+    get_reorder_implementation_list(
+            const memory_desc_t *src_md, const memory_desc_t *dst_md);
+    static const dnnl::impl::engine_t::concat_primitive_desc_create_f *
+    get_concat_implementation_list();
+    static const dnnl::impl::engine_t::sum_primitive_desc_create_f *
+    get_sum_implementation_list();
+};
+
+class sycl_cuda_engine_t : public dnnl::impl::sycl::sycl_engine_base_t {
+public:
+    using base_t = dnnl::impl::sycl::sycl_engine_base_t;
+
+    sycl_cuda_engine_t(engine_kind_t kind, const cl::sycl::device &dev,
+            const cl::sycl::context &ctx);
+    sycl_cuda_engine_t(
+            const cl::sycl::device &dev, const cl::sycl::context &ctx);
+
+    status_t create_stream(stream_t **stream, unsigned flags) override;
+    status_t create_stream(stream_t **stream, cl::sycl::queue &queue);
+
+    const dnnl::impl::engine_t::reorder_primitive_desc_create_f *
+    get_reorder_implementation_list(const memory_desc_t *src_md,
+            const memory_desc_t *dst_md) const override {
+        return cuda_gpu_engine_impl_list_t::get_reorder_implementation_list(
+                src_md, dst_md);
+    }
+
+    const dnnl::impl::engine_t::concat_primitive_desc_create_f *
+    get_concat_implementation_list() const override {
+        return cuda_gpu_engine_impl_list_t::get_concat_implementation_list();
+    }
+
+    const dnnl::impl::engine_t::sum_primitive_desc_create_f *
+    get_sum_implementation_list() const override {
+        return cuda_gpu_engine_impl_list_t::get_sum_implementation_list();
+    }
+
+    const primitive_desc_create_f *get_implementation_list(
+            const op_desc_t *) const override;
+    CUcontext get_underlying_context() const;
+    cudnnHandle_t *get_cudnn_handle() const { return cudnn_handle_.get(); }
+    cublasHandle_t *get_cublas_handle() const { return cublas_handle_.get(); }
+    const bool has_primary_context() const { return primary_context_; }
+    device_id_t device_id() const override;
+
+private:
+    // This function sets the context type. CUDA requires a different
+    // approach for retaining/releasing a primary vs. a non-primary context.
+    status_t underlying_context_type();
+    status_t set_cudnn_handle();
+    status_t set_cublas_handle();
+    // To avoid a performance penalty, cuDNN/cuBLAS require one handle per
+    // thread per context; therefore the handles are properties of the
+    // engine. An engine can be assigned to multiple streams, e.g.:
+    //   engine eng(kind, 0);
+    //   stream str1(eng, ...); stream str2(eng, ...); stream str3(eng, ...);
+    // In a multi-threaded environment, each thread should create its own
+    // engine and streams to allow safe multi-threaded programming. If all
+    // the streams belong to one thread, the same handle is used for all of
+    // them. Creating a handle is expensive and must be avoided when it is
+    // not necessary.
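+    // Each handle lives in a std::unique_ptr with a custom deleter so that
+    // cudnnDestroy()/cublasDestroy() run exactly once when the engine goes
+    // away. A rough sketch of the pattern (simplified; the members below
+    // additionally go through the CUDNN/CUBLAS_EXECUTE_FUNC_V wrappers):
+    //
+    //   std::unique_ptr<cudnnHandle_t, void (*)(cudnnHandle_t *)> handle {
+    //           new cudnnHandle_t, [](cudnnHandle_t *h) {
+    //               if (h) cudnnDestroy(*h);
+    //           }};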
+ std::unique_ptr> + cudnn_handle_ {nullptr, [](cudnnHandle_t *h) { + if (h != nullptr) { + CUDNN_EXECUTE_FUNC_V(cudnnDestroy, *h); + h = nullptr; + } + }}; + std::unique_ptr> + cublas_handle_ {nullptr, [](cublasHandle_t *h) { + if (h != nullptr) { + CUBLAS_EXECUTE_FUNC_V(cublasDestroy, *h); + h = nullptr; + } + }}; + bool primary_context_; +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/sycl_cuda_scoped_context.cpp b/src/gpu/nvidia/sycl_cuda_scoped_context.cpp new file mode 100644 index 00000000000..d0e8c4ea862 --- /dev/null +++ b/src/gpu/nvidia/sycl_cuda_scoped_context.cpp @@ -0,0 +1,63 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +cuda_sycl_scoped_context_handler_t::cuda_sycl_scoped_context_handler_t( + const sycl_cuda_engine_t &engine) + : need_to_recover_(false) { + try { + auto desired = engine.get_underlying_context(); + CUDA_EXECUTE_FUNC(cuCtxGetCurrent, &original_); + + if (original_ != desired) { + // Sets the desired context as the active one for the thread + CUDA_EXECUTE_FUNC(cuCtxSetCurrent, desired); + // No context is installed and the suggested context is primary + // This is the most common case. We can activate the context in the + // thread and leave it there until all the PI context referring to + // the same underlying CUDA primary context are destroyed. This + // emulates the behaviour of the CUDA runtime api, and avoids costly + // context switches. No action is required on this side of the if. 
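+            // need_to_recover_ ends up false only in that common case
+            // (original_ == nullptr and the desired context is the primary
+            // one); in every other case the destructor restores original_
+            // via cuCtxSetCurrent.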
+ need_to_recover_ + = !(original_ == nullptr && engine.has_primary_context()); + } + } catch (const std::runtime_error &e) { + error::wrap_c_api(status::runtime_error, e.what()); + } +} + +cuda_sycl_scoped_context_handler_t:: + ~cuda_sycl_scoped_context_handler_t() noexcept(false) { + // we need to release the placed_context_ since we set it from + // ctx.get() retains the underlying context so we need to remove it + try { + if (need_to_recover_) { CUDA_EXECUTE_FUNC(cuCtxSetCurrent, original_); } + } catch (const std::runtime_error &e) { + error::wrap_c_api(status::runtime_error, e.what()); + } +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/sycl_cuda_scoped_context.hpp b/src/gpu/nvidia/sycl_cuda_scoped_context.hpp new file mode 100644 index 00000000000..00e01b6c0d1 --- /dev/null +++ b/src/gpu/nvidia/sycl_cuda_scoped_context.hpp @@ -0,0 +1,60 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_SYCL_CUDA_SCOPED_CONTEXT_HPP +#define GPU_NVIDIA_SYCL_CUDA_SCOPED_CONTEXT_HPP + +#include +#include + +#include +#include + +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +// Scoped context is required to set the current context of a thread +// to the context of the using queue. The scoped handle class is +// required to put the stream context on top of the cuda stack +class cuda_sycl_scoped_context_handler_t { + CUcontext original_; + bool need_to_recover_; + +public: + cuda_sycl_scoped_context_handler_t(const sycl_cuda_engine_t &); + // Destruct the scope p_context placed_context_. + ~cuda_sycl_scoped_context_handler_t() noexcept(false); + + // This is a work-around function for reinterpret_casting the memory. This + // will be fixed when SYCL-2020 has been implemented for Pi backend. + template + inline T memory(const cl::sycl::interop_handler &ih, U acc) { + return reinterpret_cast(ih.get_mem(acc)); + } +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/sycl_cuda_stream.cpp b/src/gpu/nvidia/sycl_cuda_stream.cpp new file mode 100644 index 00000000000..64cba983f58 --- /dev/null +++ b/src/gpu/nvidia/sycl_cuda_stream.cpp @@ -0,0 +1,126 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/nvidia/sycl_cuda_stream.hpp" +#include "gpu/nvidia/sycl_cuda_engine.hpp" +#include "gpu/nvidia/sycl_cuda_scoped_context.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +cublasHandle_t &sycl_cuda_stream_t::get_cublas_handle() { + return *(utils::downcast(engine()) + ->get_cublas_handle()); +} + +cudnnHandle_t &sycl_cuda_stream_t::get_cudnn_handle() { + return *(utils::downcast(engine()) + ->get_cudnn_handle()); +} +// the sycl_cuda_stream_t will not own this. it is an observer pointer +CUstream sycl_cuda_stream_t::get_underlying_stream() { + return cl::sycl::get_native(*queue_); +} + +// the sycl_cuda_stream_t will not own this. it is an observer pointer +CUcontext sycl_cuda_stream_t::get_underlying_context() { + return cl::sycl::get_native(queue_->get_context()); +} + +status_t sycl_cuda_stream_t::init() { + if ((flags() & stream_flags::in_order) == 0 + && (flags() & stream_flags::out_of_order) == 0) + return status::invalid_arguments; + + // If queue_ is not set then construct it + auto &sycl_engine = *utils::downcast(engine()); + auto status = status::success; + + if (!queue_) { + auto &sycl_ctx = sycl_engine.context(); + auto &sycl_dev = sycl_engine.device(); + if (!sycl_engine.is_service_stream_created()) + queue_.reset(new cl::sycl::queue(sycl_ctx, sycl_dev)); + else { + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + auto sycl_stream = utils::downcast(service_stream); + queue_.reset(new cl::sycl::queue(sycl_stream->queue())); + } + } else { + auto queue_streamId = get_underlying_stream(); + auto sycl_dev = queue().get_device(); + bool args_ok = IMPLICATION( + engine()->kind() == engine_kind::gpu, sycl_dev.is_gpu()); + if (!sycl_dev.is_gpu()) return status::invalid_arguments; + + auto queue_context = get_underlying_context(); + CUdevice queue_device + = cl::sycl::get_native(sycl_dev); + + auto engine_context = sycl_engine.get_underlying_context(); + auto engine_device = cl::sycl::get_native( + sycl_engine.device()); + + stream_t *service_stream; + CHECK(sycl_engine.get_service_stream(service_stream)); + auto cuda_stream + = utils::downcast(service_stream); + auto engine_streamId = cuda_stream->get_underlying_stream(); + status = ((engine_device != queue_device) + || (engine_context != queue_context) + || (engine_streamId != queue_streamId)) + ? 
status::invalid_arguments + : status::success; + } + + cuda_sycl_scoped_context_handler_t sc(sycl_engine); + auto streamId = get_underlying_stream(); + auto cublas_handle = sycl_engine.get_cublas_handle(); + auto cudnn_handle = sycl_engine.get_cudnn_handle(); + assert(sycl_engine.context() == base_t::queue().get_context()); + cudaStream_t current_stream_id = nullptr; + CUDNN_EXECUTE_FUNC(cudnnGetStream, *cudnn_handle, ¤t_stream_id); + if (current_stream_id != streamId) { + CUDNN_EXECUTE_FUNC(cudnnSetStream, *cudnn_handle, streamId); + } + + CUBLAS_EXECUTE_FUNC(cublasGetStream, *cublas_handle, ¤t_stream_id); + if (current_stream_id != streamId) { + CUBLAS_EXECUTE_FUNC(cublasSetStream, *cublas_handle, streamId); + } + return status; +} + +status_t sycl_cuda_stream_t::interop_task( + std::function sycl_cuda_interop_) { + try { + this->set_deps({queue().submit( + [&](cl::sycl::handler &cgh) { sycl_cuda_interop_(cgh); })}); + return status::success; + } catch (std::runtime_error &e) { + error::wrap_c_api(status::runtime_error, e.what()); + return status::runtime_error; + } +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/nvidia/sycl_cuda_stream.hpp b/src/gpu/nvidia/sycl_cuda_stream.hpp new file mode 100644 index 00000000000..5eb60e5b086 --- /dev/null +++ b/src/gpu/nvidia/sycl_cuda_stream.hpp @@ -0,0 +1,81 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_NVIDIA_SYCL_CUDA_STREAM_HPP +#define GPU_NVIDIA_SYCL_CUDA_STREAM_HPP + +#include +#include +#include + +#include "common/engine.hpp" +#include "sycl/sycl_stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +class sycl_cuda_stream_t : public dnnl::impl::sycl::sycl_stream_t { +public: + using base_t = dnnl::impl::sycl::sycl_stream_t; + cublasHandle_t &get_cublas_handle(); + cudnnHandle_t &get_cudnn_handle(); + + static status_t create_stream( + stream_t **stream, engine_t *engine, unsigned flags) { + std::unique_ptr sycl_stream( + new sycl_cuda_stream_t(engine, flags)); + if (!sycl_stream) return status::out_of_memory; + + CHECK(sycl_stream->init()); + *stream = sycl_stream.release(); + return status::success; + } + + static status_t create_stream( + stream_t **stream, engine_t *engine, cl::sycl::queue &queue) { + unsigned flags; + CHECK(base_t::init_flags(&flags, queue)); + + std::unique_ptr sycl_stream( + new sycl_cuda_stream_t(engine, flags, queue)); + + CHECK(sycl_stream->init()); + + *stream = sycl_stream.release(); + return status::success; + } + + status_t interop_task(std::function); + CUstream get_underlying_stream(); + CUcontext get_underlying_context(); + +private: + status_t init(); + sycl_cuda_stream_t(engine_t *engine, unsigned flags, cl::sycl::queue &queue) + : base_t(engine, flags, queue) {} + sycl_cuda_stream_t(engine_t *engine, unsigned flags) + : base_t(engine, flags) {} +}; + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/sycl_cuda_utils.hpp b/src/gpu/nvidia/sycl_cuda_utils.hpp new file mode 100644 index 00000000000..66ae05fd0c3 --- /dev/null +++ b/src/gpu/nvidia/sycl_cuda_utils.hpp @@ -0,0 +1,522 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* Copyright 2020 Codeplay Software Limited +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_NVIDIA_SYCL_CUDA_UTILS_HPP +#define GPU_NVIDIA_SYCL_CUDA_UTILS_HPP + +#include +#include +#include +#include + +#include "dnnl_sycl.hpp" + +#include "common/engine.hpp" +#include "common/z_magic.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { + +#define CTX_OUT_ACCESSOR(arg) \ + utils::downcast( \ + &CTX_OUT_STORAGE(arg)) \ + ->buffer() \ + .get_access(cgh) + +#define CTX_IN_ACCESSOR(arg) \ + utils::downcast( \ + &CTX_IN_STORAGE(arg)) \ + ->buffer() \ + .get_access(cgh) + +#define CTX_SCRATCH_ACCESSOR(arg) \ + utils::downcast( \ + ctx.get_scratchpad_grantor().get_memory_storage(arg).get()) \ + ->buffer() \ + .get_access(cgh) + +// Check if the device type matches the passed engine kind +inline status_t check_device(dnnl::impl::engine_kind_t eng_kind) { + return (eng_kind == dnnl::impl::engine_kind::gpu + ? 
status::success + : status::invalid_arguments); +} + +static void convert_dnnl_dims_array( + const dnnl_dim_t *dims, int *new_dims, int n_dims) { + for (size_t i = 0; i < n_dims; i++) { + new_dims[i] = static_cast(dims[i]); + } +} + +static void convert_dims(const dnnl_dim_t *dims, int *new_dims, int n_dims, + int adjustment_size = 4, int adjustment_value = 1) { + convert_dnnl_dims_array(dims, new_dims, n_dims); + for (size_t i = n_dims; i < adjustment_size; i++) { + new_dims[i] = adjustment_value; + } +} +static bool memory_desc_matches_nchw_vect_c(const memory_desc_t *mem_desc) { + // Only one block is supported for second (C) dimension and the block size + // must be 4 and the dimension has to be a multiple of block size. + auto is_int_8 = utils::one_of(mem_desc->data_type, data_type::s8); + auto &strides = mem_desc->format_desc.blocking.strides; + if (is_int_8 && mem_desc->format_desc.blocking.inner_nblks == 1 + && mem_desc->format_desc.blocking.inner_idxs[0] == 1 + && mem_desc->format_desc.blocking.inner_blks[0] == 4 + && mem_desc->dims[1] % 4 == 0) { + for (int d = 0; d < mem_desc->ndims - 1; ++d) + if (strides[d] < strides[d + 1]) return false; + return true; + } + return false; +} + +static bool has_different_block_size( + const memory_desc_t *src_md, const memory_desc_t *dst_md) { + return ((src_md->format_desc.blocking.inner_nblks > 0 + && dst_md->format_desc.blocking.inner_nblks == 0) + || (src_md->format_desc.blocking.inner_nblks == 0 + && dst_md->format_desc.blocking.inner_nblks > 0)); +} +static bool adjust_dim_for_dnn( + int *dims, int n_dims, const memory_desc_t *mem_desc) { + if (memory_desc_matches_nchw_vect_c(mem_desc)) { + dims[n_dims] = mem_desc->format_desc.blocking.inner_blks[0]; + dims[mem_desc->format_desc.blocking.inner_idxs[0]] + /= mem_desc->format_desc.blocking.inner_blks[0]; + return true; + } + return false; +} + +static bool adjust_stride_for_dnn( + int *stride, int n_dims, const memory_desc_t *mem_desc) { + if (memory_desc_matches_nchw_vect_c(mem_desc)) { + stride[n_dims] = mem_desc->format_desc.blocking.inner_nblks; + return true; + } + return false; +} + +// Check if the dimensions contain any zeros, returns true if they do. 
+static bool has_zero_dims(const dnnl_dim_t *dims, int n_dims) { + for (size_t i = 0; i < n_dims; i++) { + if (dims[i] == 0) { return true; } + } + return false; +} + +static status_t get_format(const memory_desc_t *md, cudnnTensorFormat_t &format, + bool consider_ab_as_nhwc = false) { + const memory_desc_wrapper mem_wrapper(md); + if (memory_desc_matches_nchw_vect_c(md)) { + format = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW_VECT_C; + } else if (mem_wrapper.matches_one_of_tag(format_tag::ab, format_tag::abc, + format_tag::abcd, format_tag::abcde, + format_tag::abcdef)) { + format = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW; + } else if (mem_wrapper.matches_one_of_tag( + format_tag::acb, format_tag::acdb, format_tag::acdeb)) { + format = cudnnTensorFormat_t::CUDNN_TENSOR_NHWC; + } else { + return status::unimplemented; + } + if (consider_ab_as_nhwc && mem_wrapper.matches_one_of_tag(format_tag::ab)) { + format = cudnnTensorFormat_t::CUDNN_TENSOR_NHWC; + } + return status::success; +} + +static bool memory_format_ok(const memory_desc_t *mem_desc) { + return (memory_desc_matches_nchw_vect_c(mem_desc) + || mem_desc->format_desc.blocking.inner_nblks == 0); +} + +static status_t convert_data_type(const memory_desc_t *mem_desc, + cudnnDataType_t *cudnn_data_type, bool vectorized = true) { + switch (mem_desc->data_type) { + case dnnl_data_type_t::dnnl_f16: + *cudnn_data_type = cudnnDataType_t::CUDNN_DATA_HALF; + break; + case dnnl_data_type_t::dnnl_f32: + *cudnn_data_type = cudnnDataType_t::CUDNN_DATA_FLOAT; + break; + // CUDNN_TENSOR_NCHW_VECT_C format is only supported with tensor + // data types CUDNN_DATA_INT8x4, CUDNN_DATA_INT8x32, and + // CUDNN_DATA_UINT8x4. oneDNN does not support UINT8 and block size + // of 32, hence the only valid case is CUDNN_DATA_INT8x4 + case dnnl_data_type_t::dnnl_s8: + *cudnn_data_type + = ((vectorized + && mem_desc->format_desc.blocking.inner_blks[0] + == 4) + ? 
cudnnDataType_t::CUDNN_DATA_INT8x4 + : cudnnDataType_t::CUDNN_DATA_INT8); + break; + default: return status::unimplemented; + } + return status::success; +} + +class cublas_error : virtual public std::runtime_error { + +protected: + const char *cublas_error_map(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + + default: return ""; + } + } + + int error_number_; + +public: + explicit cublas_error(const std::string &message, cublasStatus_t result) + : std::runtime_error( + (message + std::string(cublas_error_map(result)))) { + error_number_ = static_cast(result); + } + + virtual ~cublas_error() throw() {} + + virtual int get_error_number() const throw() { return error_number_; } +}; + +class cuda_error : virtual public std::runtime_error { + +protected: + inline const char *cuda_error_map(CUresult result) { + switch (result) { + case CUDA_SUCCESS: return "CUDA_SUCCESS"; + case CUDA_ERROR_NOT_PERMITTED: return "CUDA_ERROR_NOT_PERMITTED"; + case CUDA_ERROR_INVALID_CONTEXT: + return "CUDA_ERROR_INVALID_CONTEXT"; + case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE"; + case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE"; + case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY"; + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; + default: return ""; + } + } + int error_number_; + +public: + explicit cuda_error(const std::string &message, CUresult result) + : std::runtime_error((message + std::string(cuda_error_map(result)))) { + error_number_ = static_cast(result); + } + + explicit cuda_error(const std::string &message, cudaError_t result) + : std::runtime_error( + (message + std::to_string(static_cast(result)))) { + error_number_ = static_cast(result); + } + virtual ~cuda_error() throw() {} + + virtual int get_error_number() const throw() { return error_number_; } +}; + +class cudnn_error : virtual public std::runtime_error { + +protected: + inline const char *cudnn_get_error_string(cudnnStatus_t status) { + switch (status) { + case CUDNN_STATUS_SUCCESS: return "CUDNN_STATUS_SUCCESS"; + case CUDNN_STATUS_NOT_INITIALIZED: + return "CUDNN_STATUS_NOT_INITIALIZED"; + case CUDNN_STATUS_ALLOC_FAILED: return "CUDNN_STATUS_ALLOC_FAILED"; + case CUDNN_STATUS_BAD_PARAM: return "CUDNN_STATUS_BAD_PARAM"; + case CUDNN_STATUS_INTERNAL_ERROR: + return "CUDNN_STATUS_INTERNAL_ERROR"; + case CUDNN_STATUS_INVALID_VALUE: + return "CUDNN_STATUS_INVALID_VALUE"; + case CUDNN_STATUS_ARCH_MISMATCH: + return "CUDNN_STATUS_ARCH_MISMATCH"; + case CUDNN_STATUS_MAPPING_ERROR: + return "CUDNN_STATUS_MAPPING_ERROR"; + case CUDNN_STATUS_EXECUTION_FAILED: + return "CUDNN_STATUS_EXECUTION_FAILED"; + case CUDNN_STATUS_NOT_SUPPORTED: + return "CUDNN_STATUS_NOT_SUPPORTED"; + case 
CUDNN_STATUS_LICENSE_ERROR: + return "CUDNN_STATUS_LICENSE_ERROR"; + case CUDNN_STATUS_RUNTIME_IN_PROGRESS: + return "CUDNN_STATUS_RUNTIME_IN_PROGRESS"; + case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: + return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW"; + default: return ""; + } + } + int error_number_; + +public: + explicit cudnn_error(const std::string &message, cudnnStatus_t result) + : std::runtime_error( + (message + std::string(cudnn_get_error_string(result)))) { + error_number_ = static_cast(result); + } + + virtual ~cudnn_error() throw() {} + + virtual int get_error_number() const throw() { return error_number_; } +}; + +template +cl::sycl::event copy(cl::sycl::queue &q, T *src, cl::sycl::buffer &dst) { + + auto event = q.submit([&, src](cl::sycl::handler &cgh) { + // Retrieve a write accessor to a global buffer + auto acc = dst.template get_access(cgh); + // Copy from the input pointer into the buffer associated with the + // accessor + cgh.copy(src, acc); + }); + return event; +} + +template +cl::sycl::event copy(cl::sycl::queue &q, cl::sycl::buffer &src, T *dst) { + + auto event = q.submit([&, dst](cl::sycl::handler &cgh) { + // Retrieve a read accessor to a global buffer + auto acc = src.template get_access(cgh); + // Copy from the buffer associated with the accessor into the output + // pointer + cgh.copy(acc, dst); + }); + + return event; +} + +template +cl::sycl::event copy(cl::sycl::queue &q, cl::sycl::buffer &src, + cl::sycl::buffer &dst) { + auto event = q.submit([&](cl::sycl::handler &cgh) { + auto src_acc + = src.template get_access( + cgh); + auto dst_acc + = dst.template get_access( + cgh); + cgh.copy(src_acc, dst_acc); + }); + return event; +} + +static status_t cudnn_to_dnnl_status(cudnnStatus_t cu_status) { + switch (cu_status) { + case CUDNN_STATUS_SUCCESS: return status::success; + case CUDNN_STATUS_BAD_PARAM: return status::invalid_arguments; + case CUDNN_STATUS_NOT_SUPPORTED: return status::unimplemented; + default: return status::runtime_error; + } +} + +static status_t cublas_to_dnnl_status(cublasStatus_t cu_status) { + switch (cu_status) { + case CUBLAS_STATUS_SUCCESS: return status::success; + default: return status::runtime_error; + } +} + +static status_t cuda_to_dnnl_status(CUresult cu_result) { + switch (cu_result) { + case CUDNN_STATUS_SUCCESS: return status::success; + default: return status::runtime_error; + } +} + +#define CUDA_ERROR_LOCATION __FILE__ " : " STRINGIFY(__LINE__) + +#define CUDA_EXECUTE_FUNC(name, ...) \ + { \ + auto err = name(__VA_ARGS__); \ + if (err != CUDA_SUCCESS) { \ + throw cuda_error(std::string("At :") \ + + std::string(CUDA_ERROR_LOCATION) \ + + std::string(#name) + std::string(" : "), \ + err); \ + } \ + } + +#define CUBLAS_EXECUTE_FUNC(name, ...) \ + { \ + auto err = name(__VA_ARGS__); \ + if (err != CUBLAS_STATUS_SUCCESS) { \ + throw cublas_error(std::string("At :") \ + + std::string(CUDA_ERROR_LOCATION) \ + + std::string(#name) + std::string(" : "), \ + err); \ + } \ + } + +#define CUDNN_EXECUTE_FUNC(name, ...) \ + { \ + auto err = name(__VA_ARGS__); \ + if (err != CUDNN_STATUS_SUCCESS) { \ + throw cudnn_error(std::string("At :") \ + + std::string(CUDA_ERROR_LOCATION) \ + + std::string(#name) + std::string(" : "), \ + err); \ + } \ + } + +#define CUDA_EXECUTE_FUNC_V(name, ...) 
\ + { \ + auto err = name(__VA_ARGS__); \ + if (err != CUDA_SUCCESS) { \ + std::cout << cuda_error(std::string("At :") \ + + std::string(CUDA_ERROR_LOCATION) \ + + std::string(#name) + std::string(" : "), \ + err) \ + .what() \ + << std::endl; \ + } \ + } + +#define CUDNN_EXECUTE_FUNC_V(name, ...) \ + { \ + auto err = name(__VA_ARGS__); \ + if (err != CUDNN_STATUS_SUCCESS) { \ + std::cout << cudnn_error(std::string("At :") \ + + std::string(CUDA_ERROR_LOCATION) \ + + std::string(#name) + std::string(" : "), \ + err) \ + .what() \ + << std::endl; \ + } \ + } + +#define CUBLAS_EXECUTE_FUNC_V(name, ...) \ + { \ + auto err = name(__VA_ARGS__); \ + if (err != CUBLAS_STATUS_SUCCESS) { \ + std::cout << cublas_error(std::string("At :") \ + + std::string(CUDA_ERROR_LOCATION) \ + + std::string(#name) + std::string(" : "), \ + err) \ + .what() \ + << std::endl; \ + } \ + } + +#define CUDA_EXECUTE_FUNC_S(name, ...) \ + [&]() { \ + auto err = name(__VA_ARGS__); \ + return cuda_to_dnnl_status(err); \ + }() + +#define CUBLAS_EXECUTE_FUNC_S(name, ...) \ + [&]() { \ + auto err = name(__VA_ARGS__); \ + return cublas_to_dnnl_status(err); \ + }() + +#define CUDNN_EXECUTE_FUNC_S(name, ...) \ + [&]() { \ + auto err = name(__VA_ARGS__); \ + if (err != CUDNN_STATUS_SUCCESS) { return cudnn_to_dnnl_status(err); } \ + return status::success; \ + }() + +static status_t create_and_set_tensor_descriptor( + cudnnTensorDescriptor_t *tensor_desc, cudnnDataType_t data_type, + int ndims, int *dims, int *strides) { + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, tensor_desc)); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptor, *tensor_desc, + data_type, ndims, dims, strides)); + + return status::success; +} + +static status_t create_and_set_tensor_descriptor_ex( + cudnnTensorDescriptor_t *tensor_desc, cudnnTensorFormat_t format, + cudnnDataType_t data_type, int ndims, int *dims) { + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, tensor_desc)); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptorEx, *tensor_desc, + format, data_type, ndims, dims)); + + return status::success; +} + +static status_t create_and_set_filter_descriptor( + cudnnFilterDescriptor_t *filter_desc, cudnnTensorFormat_t format, + cudnnDataType_t data_type, int ndims, int *dims, int *) { + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateFilterDescriptor, filter_desc)); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetFilterNdDescriptor, *filter_desc, + data_type, format, ndims, dims)); + + return status::success; +} + +static status_t create_and_set_conv_descriptor( + cudnnConvolutionDescriptor_t *conv_desc, int ndims, int *padding, + int *strides, int *dilation, cudnnConvolutionMode_t mode, + cudnnDataType_t data_type) { + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateConvolutionDescriptor, conv_desc)); + + CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionNdDescriptor, *conv_desc, + ndims, padding, strides, dilation, mode, data_type)); + + return status::success; +} + +} // namespace nvidia +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/ocl/ref_sum.hpp b/src/gpu/ocl/ref_sum.hpp index d06543f8288..835ab2ebc74 100644 --- a/src/gpu/ocl/ref_sum.hpp +++ b/src/gpu/ocl/ref_sum.hpp @@ -153,7 +153,9 @@ struct ref_sum_t : public gpu_primitive_t { nested_scratchpad_t ns(ctx, key_nested_multiple + i, reorders_[i]); r_ctx.set_scratchpad_grantor(ns.grantor()); CHECK(reorders_[i]->execute(r_ctx)); +#ifndef DNNL_SYCL_CUDA ctx.stream()->wait(); +#endif } if (pd()->need_output_reorder()) { diff --git 
a/tests/benchdnn/binary/binary.cpp b/tests/benchdnn/binary/binary.cpp index 9644fc08f09..ebb6fd082f4 100644 --- a/tests/benchdnn/binary/binary.cpp +++ b/tests/benchdnn/binary/binary.cpp @@ -240,6 +240,14 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) { res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; return; } + + if (is_nvidia_gpu()) { + const bool alg_ok = !(prb->alg == alg_t::DIV || prb->alg == alg_t::SUB); + if (!alg_ok || !prb->attr.post_ops.is_def()) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/bnorm/bnorm.cpp b/tests/benchdnn/bnorm/bnorm.cpp index c33ac9acdcb..35d8d984517 100644 --- a/tests/benchdnn/bnorm/bnorm.cpp +++ b/tests/benchdnn/bnorm/bnorm.cpp @@ -218,6 +218,15 @@ static int compare(const prb_t *prb, data_kind_t kind, const dnn_mem_t &fp_mem, float eps = eps_coeff * (kind == DATA ? 5e-7 : 0); if (kind == SS && prb->dir & FLAG_BWD) eps = eps_coeff * 5e-6; + if (is_nvidia_gpu()) { + // cuDNN stores unbiased variance which requires rescaling by + // `(N - 1) / N`, where `N = MB * Spatial`. Hence, we cannot set the + // threshold to 0... + // Also the mean could also be rounded incorrectly (how?!) + if (kind == MEAN) eps = 1e-7; + if (kind == VAR) eps = 4e-7; + } + // Since bwd testing is done using results from forward which are random // fp32 values, diff_ss starts fluctuating, so we check norm for both data // and SS. @@ -457,6 +466,20 @@ int init_pd(dnnl_engine_t engine, const prb_t *prb, dnnl_primitive_desc_t &bpd, void check_known_skipped_case(const prb_t *prb, res_t *res) { check_known_skipped_case_common({prb->dt}, prb->dir, res); + if (res->state == SKIPPED) return; + + if (is_nvidia_gpu()) { + const bool bwd_ok + = !((prb->dir & FLAG_BWD) && (prb->flags & GLOB_STATS)); + const bool inference_ok + = IMPLICATION(prb->dt == dnnl_s8 || prb->dt == dnnl_f16, + (prb->dir & FLAG_INF) && (prb->flags & GLOB_STATS)); + + if (!bwd_ok || !inference_ok) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/conv/conv.cpp b/tests/benchdnn/conv/conv.cpp index b3c36c6e6e4..2b09651ca4f 100644 --- a/tests/benchdnn/conv/conv.cpp +++ b/tests/benchdnn/conv/conv.cpp @@ -696,6 +696,47 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) { return; } } + + if (is_nvidia_gpu()) { + const int64_t ID = prb->id, IH = prb->ih, IW = prb->iw; + const int64_t OD = prb->od, OH = prb->oh, OW = prb->ow; + const int64_t KD = prb->kd, KH = prb->kh, KW = prb->kw; + const int64_t SD = prb->sd, SH = prb->sh, SW = prb->sw; + const int64_t PD = prb->pd, PH = prb->ph, PW = prb->pw; + const int64_t PD_R = prb->pd_r, PH_R = prb->ph_r, PW_R = prb->pw_r; + const bool pad_ok = PD >= PD_R && PH >= PH_R && PW >= PW_R; + // copy-pasted from str2desc, dilation is not supported for Nvidia + const auto compute_out + = [](int64_t i, int64_t k, int64_t s, int64_t p) { + return (i - k + 2 * p) / s + 1; + }; + const bool out_ok = OD == compute_out(ID, KD, SD, PD) + && OH == compute_out(IH, KH, SH, PH) + && OW == compute_out(IW, KW, SW, PW); + + const auto &po = prb->attr.post_ops; + bool post_ops_ok = true; + for (int i = 0; i < po.len(); ++i) { + const auto &e = po.entry[i]; + if (e.is_sum_kind()) + continue; + else if (e.is_eltwise_kind()) + post_ops_ok = post_ops_ok && is_nvidia_eltwise_ok(prb->dir, e); + else if (e.is_binary_kind() || e.is_convolution_kind()) + post_ops_ok = false; + else + 
assert(!"unknown post-op type"); + } + + const auto dtag = normalize_tag(prb->dtag, prb->ndims); + const bool dtag_is_axb = dtag == normalize_tag(tag::axb, prb->ndims); + const bool tag_ok = !((prb->dir & FLAG_BWD) && dtag_is_axb); + // TODO: specified wtag (even for supported formats) is not working? + if (!pad_ok || !out_ok || !post_ops_ok || !tag_ok) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/conv/deconv.cpp b/tests/benchdnn/conv/deconv.cpp index 95790eb1ada..67247805efe 100644 --- a/tests/benchdnn/conv/deconv.cpp +++ b/tests/benchdnn/conv/deconv.cpp @@ -197,6 +197,49 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) { res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; return; } + + if (is_nvidia_gpu()) { + const int64_t ID = prb->id, IH = prb->ih, IW = prb->iw; + const int64_t OD = prb->od, OH = prb->oh, OW = prb->ow; + const int64_t KD = prb->kd, KH = prb->kh, KW = prb->kw; + const int64_t SD = prb->sd, SH = prb->sh, SW = prb->sw; + const int64_t PD = prb->pd, PH = prb->ph, PW = prb->pw; + const int64_t PD_R = prb->pd_r, PH_R = prb->ph_r, PW_R = prb->pw_r; + const bool pad_ok = PD >= PD_R && PH >= PH_R && PW >= PW_R; + // copy-pasted from str2desc, dilation is not supported for Nvidia + const auto compute_out + = [](int64_t i, int64_t k, int64_t s, int64_t p) { + return (i - 1) * s + k - 2 * p; + }; + const bool out_ok = OD == compute_out(ID, KD, SD, PD) + && OH == compute_out(IH, KH, SH, PH) + && OW == compute_out(IW, KW, SW, PW); + + bool post_ops_ok = prb->attr.post_ops.is_def(); + + const auto stag = normalize_tag(prb->stag, prb->ndims); + const bool stag_is_axb = stag == normalize_tag(tag::axb, prb->ndims); + const bool fwd_tag_ok = !((prb->dir & FLAG_FWD) && stag_is_axb); + const bool bwd_tag_ok + = !((prb->dir == BWD_W || prb->dir == BWD_WB) && stag_is_axb); + const bool tag_ok = fwd_tag_ok && bwd_tag_ok; + // TODO: specified wtag (even for supported formats) is not working? + if (!pad_ok || !out_ok || !post_ops_ok || !tag_ok) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + + // FIXME: there's a bug in the library resulting in + // memory_tracking.hpp:458: Assertion `registry_.size() == 0' failed. + // Specifically for 3D spatial case, when both BWD_W and BWD_WB are + // run. It must be cache interaction, but not clear which side is + // guilty. Likely Nvidia implementation. Switch it off until further + // investigation. + if (prb->ndims == 5 && prb->dir == BWD_WB) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/dnnl_common.cpp b/tests/benchdnn/dnnl_common.cpp index 149ffcf35a8..d4158cba429 100644 --- a/tests/benchdnn/dnnl_common.cpp +++ b/tests/benchdnn/dnnl_common.cpp @@ -17,6 +17,11 @@ #include #include "oneapi/dnnl/dnnl.h" +// For is_nvidia_gpu(...) 
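+// (the SYCL interop header is only pulled in for DPC++ runtime builds; in
+// non-SYCL builds the is_nvidia_gpu() helper below simply returns false)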
+#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP +#include "oneapi/dnnl/dnnl_sycl.hpp" +#endif + #include "dnnl_common.hpp" #include "dnnl_memory.hpp" @@ -255,5 +260,45 @@ void check_known_skipped_case_common( r->state = SKIPPED, r->reason = DATA_TYPE_NOT_SUPPORTED; break; } + // cuda supports only f32, f16 and s8 data types + if (is_nvidia_gpu() + && (i_dt == dnnl_bf16 || i_dt == dnnl_u8 || i_dt == dnnl_s32)) { + r->state = SKIPPED, r->reason = DATA_TYPE_NOT_SUPPORTED; + break; + } } } + +bool is_nvidia_gpu(const engine_t &engine) { + dnnl_engine_kind_t engine_kind = dnnl_any_engine; + DNN_SAFE_V(dnnl_engine_get_kind(engine, &engine_kind)); + + if (engine_kind != dnnl_gpu) return false; +#if DNNL_WITH_SYCL + constexpr int nvidia_vendor_id = 0x10DE; + auto eng = dnnl::engine(engine, true); + auto device = dnnl::sycl_interop::get_device(eng); + const auto eng_vendor_id + = device.get_info(); + return eng_vendor_id == nvidia_vendor_id; +#endif + return false; +} + +bool is_nvidia_eltwise_ok( + dir_t dir, attr_t::post_ops_t::kind_t alg, float alpha) { + using pk_t = attr_t::post_ops_t::kind_t; + switch (alg) { + case pk_t::BRELU: return true; + case pk_t::ELU: return (dir & FLAG_FWD); + case pk_t::LOGISTIC: return (dir & FLAG_FWD); + case pk_t::TANH: return (dir & FLAG_FWD); + case pk_t::RELU: return alpha == 0.f; + // TODO: can be easily supported by Nvidia backend + // case pk_t::ELU_DST: return true; + // case pk_t::LOGISTIC_DST: return true; + // case pk_t::TANH_DST: return true; + // case pk_t::RELU_DST: return alpha == 0.f; + default: return false; + }; +} diff --git a/tests/benchdnn/dnnl_common.hpp b/tests/benchdnn/dnnl_common.hpp index ecdfc505604..41f026c2bf6 100644 --- a/tests/benchdnn/dnnl_common.hpp +++ b/tests/benchdnn/dnnl_common.hpp @@ -320,4 +320,12 @@ bool check_md_consistency_with_tag( void check_known_skipped_case_common( const std::vector &v_dt, dir_t dir, res_t *r); +bool is_nvidia_gpu(const engine_t &engine = get_test_engine()); +bool is_nvidia_eltwise_ok( + dir_t dir, attr_t::post_ops_t::kind_t alg, float alpha); +inline bool is_nvidia_eltwise_ok( + dir_t dir, const attr_t::post_ops_t::entry_t &e) { + return is_nvidia_eltwise_ok(dir, e.kind, e.eltwise.alpha); +} + #endif diff --git a/tests/benchdnn/dnnl_memory.hpp b/tests/benchdnn/dnnl_memory.hpp index 3aa6493b9cf..cda5e588e26 100644 --- a/tests/benchdnn/dnnl_memory.hpp +++ b/tests/benchdnn/dnnl_memory.hpp @@ -238,7 +238,22 @@ struct dnn_mem_t { } else { is_data_owner_ = false; data_ = NULL; + +#if DNNL_WITH_SYCL + // XXX: A hack to mitigate the issue from create_from_host_ptr when + // perform a CPU reorder due to USM in not supported on Nvidia, but + // it's not allowed to convert host_ptr to SYCL buffer. 
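+            // Work around it by creating the memory from a SYCL buffer on
+            // Nvidia GPUs; all other configurations keep the regular
+            // dnnl_memory_create() path below.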
+ engine_t e(engine_kind_); + if (is_nvidia_gpu(e)) { + DNN_SAFE(dnnl_sycl_interop_memory_create(&m_, &md_, engine, + dnnl_sycl_interop_buffer, handle), + CRIT); + } else { + DNN_SAFE(dnnl_memory_create(&m_, &md_, engine, handle), CRIT); + } +#else DNN_SAFE(dnnl_memory_create(&m_, &md_, engine, handle), CRIT); +#endif } if (handle == DNNL_MEMORY_ALLOCATE) { diff --git a/tests/benchdnn/eltwise/eltwise.cpp b/tests/benchdnn/eltwise/eltwise.cpp index 5d29c584c87..4eeca612c05 100644 --- a/tests/benchdnn/eltwise/eltwise.cpp +++ b/tests/benchdnn/eltwise/eltwise.cpp @@ -19,7 +19,7 @@ #include #include -#include "dnnl.h" +#include "oneapi/dnnl/dnnl.h" #include "tests/test_thread.hpp" @@ -341,6 +341,14 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) { res->state = SKIPPED, res->reason = INVALID_CASE; return; } + + if (is_nvidia_gpu()) { + if (!is_nvidia_eltwise_ok(prb->dir, prb->alg, prb->alpha) + || !prb->attr.post_ops.is_def()) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/inputs/resampling/test_resampling_all b/tests/benchdnn/inputs/resampling/test_resampling_all index a5e279227f0..4f9472bef26 100644 --- a/tests/benchdnn/inputs/resampling/test_resampling_all +++ b/tests/benchdnn/inputs/resampling/test_resampling_all @@ -12,4 +12,3 @@ # bf16 --batch=test_resampling_bfloat16 - diff --git a/tests/benchdnn/ip/ip.cpp b/tests/benchdnn/ip/ip.cpp index b6c22df04d9..a903905a133 100644 --- a/tests/benchdnn/ip/ip.cpp +++ b/tests/benchdnn/ip/ip.cpp @@ -304,6 +304,29 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) { {prb->cfg[SRC].dt, prb->cfg[WEI].dt, prb->cfg[DST].dt}, prb->dir, res); if (res->state == SKIPPED) return; + + if (is_nvidia_gpu()) { + const auto &po = prb->attr.post_ops; + bool post_ops_ok = true; + for (int i = 0; i < po.len(); ++i) { + const auto &e = po.entry[i]; + if (e.is_sum_kind()) + continue; + else if (e.is_eltwise_kind()) + post_ops_ok = post_ops_ok && is_nvidia_eltwise_ok(prb->dir, e); + else if (e.is_binary_kind() || e.is_convolution_kind()) + post_ops_ok = false; + else + assert(!"unknown post-op type"); + } + + const bool oscale_ok = prb->attr.oscale.policy == policy_t::COMMON; + + if (!post_ops_ok || !oscale_ok) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/lnorm/lnorm.cpp b/tests/benchdnn/lnorm/lnorm.cpp index bc8bf571dd1..c570dfe68e7 100644 --- a/tests/benchdnn/lnorm/lnorm.cpp +++ b/tests/benchdnn/lnorm/lnorm.cpp @@ -470,6 +470,12 @@ static int init_pd(dnnl_engine_t engine, const prb_t *prb, void check_known_skipped_case(const prb_t *prb, res_t *res) { check_known_skipped_case_common({prb->dt}, prb->dir, res); + if (res->state == SKIPPED) return; + + if (is_nvidia_gpu()) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/lrn/lrn.cpp b/tests/benchdnn/lrn/lrn.cpp index 52d785a70a8..63157baa330 100644 --- a/tests/benchdnn/lrn/lrn.cpp +++ b/tests/benchdnn/lrn/lrn.cpp @@ -167,6 +167,14 @@ static int init_pd(dnnl_engine_t engine, const prb_t *prb, void check_known_skipped_case(const prb_t *prb, res_t *res) { check_known_skipped_case_common({prb->dt}, prb->dir, res); + if (res->state == SKIPPED) return; + + if (is_nvidia_gpu()) { + if (prb->alg != ACROSS || prb->ls % 2 != 1) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + } } int 
doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/matmul/matmul.cpp b/tests/benchdnn/matmul/matmul.cpp index a323e62905d..69fb11b936d 100644 --- a/tests/benchdnn/matmul/matmul.cpp +++ b/tests/benchdnn/matmul/matmul.cpp @@ -290,6 +290,31 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) { return; } } + + if (is_nvidia_gpu()) { + const auto &po = prb->attr.post_ops; + bool post_ops_ok = true; + for (int i = 0; i < po.len(); ++i) { + const auto &e = po.entry[i]; + if (e.is_sum_kind()) + continue; + else if (e.is_eltwise_kind()) + post_ops_ok = post_ops_ok && is_nvidia_eltwise_ok(FLAG_FWD, e); + else if (e.is_binary_kind() || e.is_convolution_kind()) + post_ops_ok = false; + else + assert(!"unknown post-op type"); + } + + const bool oscale_ok = prb->attr.oscale.policy == policy_t::COMMON; + + const bool zp_ok = prb->attr.zero_points.is_def(); + + if (!post_ops_ok || !oscale_ok || !zp_ok) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/pool/pool.cpp b/tests/benchdnn/pool/pool.cpp index 94cd8b67042..63d0adf8cae 100644 --- a/tests/benchdnn/pool/pool.cpp +++ b/tests/benchdnn/pool/pool.cpp @@ -59,6 +59,12 @@ inline int compare_dat(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt, else ok = (fabs(fp) > 1e-5 ? rel_diff : diff) <= prb->cfg[kind].eps; + // XXX: bug in cuDNN: it spits fp16 min value as -inf, not -65504 + if (!ok && is_nvidia_gpu() && prb->cfg[kind].dt == dnnl_f16) { + ok = fp == lowest_dt(prb->cfg[kind].dt) && std::isinf(dt) + && std::signbit(dt); + } + res->errors += !ok; bool dump = (!ok && (res->errors < 10 || verbose >= 10)) @@ -258,6 +264,23 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) { res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; return; } + + if (is_nvidia_gpu()) { + const int64_t PD = prb->pd, PH = prb->ph, PW = prb->pw; + const int64_t PD_R = prb->pd_r, PH_R = prb->ph_r, PW_R = prb->pw_r; + const bool pad_ok + = !(prb->alg == AVG_P && (PD < PD_R || PH < PH_R || PW < PW_R)); + + const int64_t DD = prb->dd, DH = prb->dh, DW = prb->dw; + const bool dilation_ok = DD == 0 && DH == 0 && DW == 0; + + const bool post_ops_ok = prb->attr.post_ops.is_def(); + + if (!pad_ok || !dilation_ok || !post_ops_ok) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/reduction/reduction.cpp b/tests/benchdnn/reduction/reduction.cpp index 27037d0de1e..6562b88555e 100644 --- a/tests/benchdnn/reduction/reduction.cpp +++ b/tests/benchdnn/reduction/reduction.cpp @@ -191,6 +191,11 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) { res->state = SKIPPED, res->reason = INVALID_CASE; return; } + + if (is_nvidia_gpu()) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/reorder/reorder.cpp b/tests/benchdnn/reorder/reorder.cpp index 375b52e0a22..45dc028622c 100644 --- a/tests/benchdnn/reorder/reorder.cpp +++ b/tests/benchdnn/reorder/reorder.cpp @@ -318,6 +318,14 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) { return; } } + + if (is_nvidia_gpu()) { + const bool oscale_ok = prb->attr.oscale.policy == policy_t::COMMON; + if (!oscale_ok) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/resampling/resampling.cpp 
b/tests/benchdnn/resampling/resampling.cpp index 5475529a25b..0750905e758 100644 --- a/tests/benchdnn/resampling/resampling.cpp +++ b/tests/benchdnn/resampling/resampling.cpp @@ -39,6 +39,7 @@ inline int compare_dat(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt, res->total = nelems; float trh = 0; + float eps = 1e-5; if (prb->alg == nearest) { // On forward, `dst` consists of exact `src` elements, hence the result // shall be exact (no matter what data type is). On backward, the @@ -54,6 +55,12 @@ inline int compare_dat(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt, } else { assert(prb->alg == linear); trh = prb->dt == dnnl_f32 ? 1e-6 : 1e-2; + if (is_nvidia_gpu()) { + // cuDNN precision is different from ref one due to different + // computation algorithm used for resampling. + trh = prb->dt == dnnl_f16 ? 4e-1 : 8e-4; + eps = prb->dt == dnnl_f16 ? 1e-1 : 8e-5; + } } for (int64_t i = 0; i < nelems; ++i) { @@ -63,7 +70,7 @@ inline int compare_dat(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt, const float diff = fabsf(fp - dt); const float rel_diff = diff / (fabsf(fp) > FLT_MIN ? fabsf(fp) : 1); - const bool ok = (fabsf(fp) > 1e-5 ? rel_diff : diff) <= trh; + const bool ok = (fabsf(fp) > eps ? rel_diff : diff) <= trh; res->errors += !ok; @@ -150,7 +157,7 @@ static int init_pd(dnnl_engine_t engine, const prb_t *prb, : prb->ndims == 4 ? dst_2d_dims : dst_1d_dims; std::string src_tag = (prb->dir & FLAG_FWD) ? prb->tag : tag::any; - std::string dst_tag = tag::any; + std::string dst_tag = (prb->dir & FLAG_BWD) ? prb->tag : tag::any; SAFE(init_md(&src_d, prb->ndims, src_dims, prb->dt, src_tag), CRIT); @@ -219,6 +226,14 @@ static int init_pd(dnnl_engine_t engine, const prb_t *prb, void check_known_skipped_case(const prb_t *prb, res_t *res) { check_known_skipped_case_common({prb->dt}, prb->dir, res); + if (res->state == SKIPPED) return; + + if (is_nvidia_gpu()) { + if (prb->ndims == 5 || prb->alg == nearest) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/benchdnn/rnn/rnn.cpp b/tests/benchdnn/rnn/rnn.cpp index 72ba3293954..1fa1531846a 100644 --- a/tests/benchdnn/rnn/rnn.cpp +++ b/tests/benchdnn/rnn/rnn.cpp @@ -766,6 +766,11 @@ void check_known_skipped_case(const prb_t &prb, res_t *res) { return; } } + + if (is_nvidia_gpu()) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } } int doit(const prb_t &prb, res_t *res) { diff --git a/tests/benchdnn/shuffle/shuffle.cpp b/tests/benchdnn/shuffle/shuffle.cpp index d5ad95a0d3c..b36a8766de6 100644 --- a/tests/benchdnn/shuffle/shuffle.cpp +++ b/tests/benchdnn/shuffle/shuffle.cpp @@ -146,6 +146,12 @@ static int init_pd(dnnl_engine_t engine, const prb_t *prb, void check_known_skipped_case(const prb_t *prb, res_t *res) { check_known_skipped_case_common({prb->dt}, prb->dir, res); + if (res->state == SKIPPED) return; + + if (is_nvidia_gpu()) { + res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + return; + } } int doit(const prb_t *prb, res_t *res) { diff --git a/tests/gtests/CMakeLists.txt b/tests/gtests/CMakeLists.txt index 8adbac8d657..67c8c8b3a04 100644 --- a/tests/gtests/CMakeLists.txt +++ b/tests/gtests/CMakeLists.txt @@ -185,7 +185,7 @@ endif() foreach(TEST_FILE ${PRIM_TEST_CASES_SRC}) get_filename_component(exe ${TEST_FILE} NAME_WE) - if(NOT ${exe} MATCHES "${skip_usm_pattern}") + if(NOT ${exe} MATCHES "${skip_usm_pattern}" AND NOT DNNL_SYCL_CUDA) register_gtest(${exe} ${TEST_FILE}) endif() diff --git 
a/tests/gtests/api/CMakeLists.txt b/tests/gtests/api/CMakeLists.txt index 424ed8a24b6..becb2595afc 100644 --- a/tests/gtests/api/CMakeLists.txt +++ b/tests/gtests/api/CMakeLists.txt @@ -19,8 +19,13 @@ set(TEST_EXE test_api) file(GLOB TEST_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/test_*.cpp) list(APPEND TEST_SOURCES ${MAIN_SRC_GTEST}) +# Switch off C API tests for CUDA since USM model is not supported +if(NOT DNNL_SYCL_CUDA) + register_exe(${TEST_EXE} "${TEST_SOURCES}" "test" "dnnl_gtest") +endif() + # Create DPC++ buffer target. -if(DNNL_SYCL_DPCPP) +if(DNNL_SYCL_DPCPP AND NOT DNNL_SYCL_CUDA) register_exe(${TEST_EXE}_buffer "${TEST_SOURCES}" "test" "dnnl_gtest") target_compile_definitions(${TEST_EXE}_buffer PUBLIC -DTEST_DNNL_DPCPP_BUFFER) endif() diff --git a/tests/gtests/api/test_memory_creation.cpp b/tests/gtests/api/test_memory_creation.cpp index 7cd9fcb9964..d8d8766f05d 100644 --- a/tests/gtests/api/test_memory_creation.cpp +++ b/tests/gtests/api/test_memory_creation.cpp @@ -53,6 +53,12 @@ class memory_creation_test_t dnnl::memory::desc md(p.dims, memory::data_type::f32, p.fmt_tag); dnnl::memory::dim phys_size = md.get_size() / sizeof(data_t); +#ifdef DNNL_SYCL_CUDA + const dnnl::impl::memory_desc_wrapper mdw(md.data); + SKIP_IF(!mdw.is_plain() && !mdw.format_any(), + "Non-plain formats are not supported on CUDA backend"); +#endif + // mem0 // Initially spoiled by putting non-zero values in padded area. // The test will manually fix it later. diff --git a/tests/gtests/api/test_namespace.cpp b/tests/gtests/api/test_namespace.cpp new file mode 100644 index 00000000000..74653c3f8b6 --- /dev/null +++ b/tests/gtests/api/test_namespace.cpp @@ -0,0 +1,29 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "dnnl_test_common.hpp" +#include "gtest/gtest.h" + +#include "oneapi/dnnl/dnnl.hpp" + +namespace dnnl { + +TEST(namespace_test, TestAliasNamespace) { + const version_t *version = ::oneapi::dnnl::version(); + (void)version; +} + +} // namespace dnnl diff --git a/tests/gtests/dnnl_test_common.hpp b/tests/gtests/dnnl_test_common.hpp index cf2a0e31ef8..71af8226694 100644 --- a/tests/gtests/dnnl_test_common.hpp +++ b/tests/gtests/dnnl_test_common.hpp @@ -87,6 +87,27 @@ dnnl::engine::kind get_test_engine_kind(); dnnl::engine get_test_engine(); #endif +inline int get_vendor_id(const std::string &vendor) { + if (vendor == "nvidia") { + return 0x10DE; + } else if (vendor == "intel") { + return 0x8086; + } else { + return -1; + } +} + +inline bool is_nvidia_gpu(const dnnl::engine &eng) { +#if DNNL_WITH_SYCL + const int nvidia_vendor_id = get_vendor_id("nvidia"); + const auto device = dnnl::sycl_interop::get_device(eng); + const auto eng_vendor_id + = device.get_info(); + return eng_vendor_id == nvidia_vendor_id; +#endif + return false; +} + inline bool unsupported_data_type(memory::data_type dt, dnnl::engine eng) { dnnl::engine::kind kind = eng.get_kind(); @@ -94,7 +115,16 @@ inline bool unsupported_data_type(memory::data_type dt, dnnl::engine eng) { if (kind == dnnl::engine::kind::cpu) supported = dnnl::impl::cpu::platform::has_data_type_support( memory::convert_to_c(dt)); - +#ifdef DNNL_SYCL_CUDA + if (is_nvidia_gpu(eng)) { + switch (dt) { + case memory::data_type::f32: return false; + case memory::data_type::f16: return false; + case memory::data_type::s8: return false; + default: return true; + } + } +#endif return !supported; } diff --git a/tests/gtests/dnnl_test_macros.hpp b/tests/gtests/dnnl_test_macros.hpp index ef18dbc9587..a1bf9e869c0 100644 --- a/tests/gtests/dnnl_test_macros.hpp +++ b/tests/gtests/dnnl_test_macros.hpp @@ -33,6 +33,27 @@ } \ } while (0) +#define SKIP_FOR_LOOP(cond, msg) \ + if (cond) { \ + std::cout << "[ SKIPPED ] " << (msg) << std::endl; \ + continue; \ + } + +#ifdef DNNL_SYCL_CUDA +#define SKIP_IF_CUDA(cond, message) \ + do { \ + SKIP_IF(get_test_engine_kind() == engine::kind::gpu && (cond), \ + (message)); \ + } while (0) + +#define SKIP_FOR_LOOP_CUDA(cond, message) \ + SKIP_FOR_LOOP( \ + get_test_engine_kind() == engine::kind::gpu && (cond), (message)); +#else +#define SKIP_IF_CUDA(cond, message) +#define SKIP_FOR_LOOP_CUDA(cond, message) +#endif + #define TEST_F_(test_fixture, test_name) TEST_F(test_fixture, test_name) #define CPU_TEST_F(test_fixture, test_name) \ diff --git a/tests/gtests/test_batch_normalization_common.hpp b/tests/gtests/test_batch_normalization_common.hpp index f8f0ff6366c..ff087a2e7bd 100644 --- a/tests/gtests/test_batch_normalization_common.hpp +++ b/tests/gtests/test_batch_normalization_common.hpp @@ -75,10 +75,34 @@ class bnorm_test_common : public ::testing::TestWithParam { protected: virtual void SetUp() { p = ::testing::TestWithParam::GetParam(); + + SKIP_IF_CUDA(!cuda_check_format_tags(p.tags.data_tag, p.tags.diff_tag), + "Unsupported format tag"); + catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tags( + memory::format_tag src_format, memory::format_tag diff_format) { + bool src_ok = src_format == memory::format_tag::ncdhw + || src_format == memory::format_tag::ndhwc + || src_format == memory::format_tag::nchw + || src_format == memory::format_tag::nhwc + || src_format == 
memory::format_tag::ncw + || src_format == memory::format_tag::nwc + || src_format == memory::format_tag::any; + bool diff_ok = diff_format == memory::format_tag::oidhw + || diff_format == memory::format_tag::odhwi + || diff_format == memory::format_tag::oihw + || diff_format == memory::format_tag::hwio + || diff_format == memory::format_tag::oiw + || diff_format == memory::format_tag::oiw + || diff_format == memory::format_tag::any; + + return src_ok && diff_ok; + } + void Test() { using bf = normalization_flags; p = ::testing::TestWithParam::GetParam(); @@ -201,6 +225,11 @@ class bnorm_test_common : public ::testing::TestWithParam { normalization_flags flags = normalization_flags::none) { bool useScaleShift = (bool)(flags & normalization_flags::use_scale_shift); + bool useGlobalStats + = (bool)(flags & normalization_flags::use_global_stats); + (void)useGlobalStats; + + SKIP_IF_CUDA(useGlobalStats, "Global stats not supported"); auto bnorm_fwd_d = batch_normalization_forward::desc( prop_kind::forward_training, *data_d, p.epsilon, flags); @@ -251,6 +280,11 @@ class bnorm_test_common : public ::testing::TestWithParam { check_zero_tail(1, diff_src->get()); check_zero_tail(1, diff_dst->get()); + // Run a forward pass first for Nvidia backend to generate the workspace + // needed by the backward pass. + if (is_nvidia_gpu(eng)) + execBnormFwd(true, useGlobalStats, useScaleShift); + execBnormBwd(useScaleShift, pk); check_bnorm_bwd(p, src->get(), diff_dst->get(), mean, variance, weights, diff --git a/tests/gtests/test_binary.cpp b/tests/gtests/test_binary.cpp index 68381ce7046..54c92b9a886 100644 --- a/tests/gtests/test_binary.cpp +++ b/tests/gtests/test_binary.cpp @@ -50,23 +50,37 @@ class binary_test_t : public ::testing::TestWithParam { SKIP_IF(unsupported_data_type(src0_dt), "Engine does not support this data type."); + SKIP_IF(unsupported_data_type(src1_dt), + "Engine does not support this data type."); + + for (auto tag : p.srcs_format) { + MAYBE_UNUSED(tag); + SKIP_IF_CUDA(!cuda_check_format_tag(tag), + "Unsupported source format tag"); + } + SKIP_IF_CUDA(!cuda_check_format_tag(p.dst_format), + "Unsupported destination format tag"); + catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tag(tag atag) { + return atag == tag::abcd || atag == tag::acdb; + } + void Test() { + auto eng = get_test_engine(); + auto strm = make_stream(eng); + // binary specific types and values using op_desc_t = binary::desc; using pd_t = binary::primitive_desc; allows_attr_t aa {false}; - aa.po_sum = true; - aa.po_eltwise = true; - aa.po_binary = true; aa.scales = true; - - auto eng = get_test_engine(); - auto strm = make_stream(eng); - + aa.po_sum = !is_nvidia_gpu(eng); + aa.po_eltwise = !is_nvidia_gpu(eng); + aa.po_binary = !is_nvidia_gpu(eng); std::vector srcs_md; std::vector srcs; diff --git a/tests/gtests/test_concat.cpp b/tests/gtests/test_concat.cpp index 6eac53033c1..05483029491 100644 --- a/tests/gtests/test_concat.cpp +++ b/tests/gtests/test_concat.cpp @@ -90,12 +90,28 @@ class concat_test_t : public ::testing::TestWithParam { } protected: + bool cuda_supported_format_tag(memory::format_tag tag) { + return impl::utils::one_of(tag, dnnl_a, dnnl_ab, dnnl_abc, dnnl_abcd, + dnnl_abcde, dnnl_abcdef, dnnl_abdec, dnnl_acb, dnnl_acbde, + dnnl_acbdef, dnnl_acdb, dnnl_acdeb, dnnl_ba, dnnl_bac, + dnnl_bacd, dnnl_bca, dnnl_bcda, dnnl_bcdea, dnnl_cba, dnnl_cdba, + dnnl_cdeba, dnnl_decab, dnnl_defcab, dnnl_aBc4b, dnnl_aBcd4b, + dnnl_aBcde4b); + } + void SetUp() 
override { auto data_type = data_traits::data_type; SKIP_IF(unsupported_data_type(data_type), "Engine does not support this data type."); concat_test_params_t p = ::testing::TestWithParam::GetParam(); + for (int i = 0; i < p.srcs_cds.size(); i++) { + SKIP_IF_CUDA(!cuda_supported_format_tag(p.srcs_format[i]), + "Unsupported format tag"); + } + + SKIP_IF_CUDA(!cuda_supported_format_tag(p.dst_format), + "Unsupported format tag"); catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status, false); } diff --git a/tests/gtests/test_convolution_backward_data_common.hpp b/tests/gtests/test_convolution_backward_data_common.hpp index 9db23ff0fdc..7f3c3517284 100644 --- a/tests/gtests/test_convolution_backward_data_common.hpp +++ b/tests/gtests/test_convolution_backward_data_common.hpp @@ -92,10 +92,55 @@ class convolution_backward_data_test virtual void SetUp() { auto p = ::testing::TestWithParam< test_convolution_params_t>::GetParam(); + + SKIP_IF_CUDA( + !(cuda_check_format_tags(p.formats.src_format) + && cuda_check_format_tags(p.formats.dst_format) + && (cuda_check_format_tags(p.formats.weights_format) + || (impl::utils::one_of( + p.formats.weights_format, + /* weights formats */ + memory::format_tag::gowi, + memory::format_tag::gohwi, + memory::format_tag::godhwi, + memory::format_tag::owi, + memory::format_tag::ohwi, + memory::format_tag::odhwi))) + && data_traits::data_type + == memory::data_type::f32 + && data_traits::data_type + == memory::data_type::f32 + && data_traits::data_type + == memory::data_type::f32 + && check_cuda_alg_format(p.formats.dst_format, + p.formats.weights_format, p.aalgorithm)), + "format is not supported."); + catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tags(memory::format_tag tag) { + return impl::utils::one_of(tag, memory::format_tag::ab, + memory::format_tag::abc, memory::format_tag::abcd, + memory::format_tag::abcde, memory::format_tag::abcdef, + memory::format_tag::acb, memory::format_tag::acdb, + memory::format_tag::acdeb); + } + + bool check_cuda_alg_format(memory::format_tag dst_fmt, + memory::format_tag wei_fmt, algorithm alg) { + bool res = dst_fmt == wei_fmt; + if (alg == dnnl::algorithm::convolution_winograd) { + res = res + && impl::utils::one_of(wei_fmt, memory::format_tag::ab, + memory::format_tag::abc, memory::format_tag::abcd, + memory::format_tag::abcde, + memory::format_tag::abcdef); + } + return res; + } + void Test() { auto p = ::testing::TestWithParam< test_convolution_params_t>::GetParam(); diff --git a/tests/gtests/test_convolution_backward_weights_common.hpp b/tests/gtests/test_convolution_backward_weights_common.hpp index fc6a66575bc..553aa7b1b84 100644 --- a/tests/gtests/test_convolution_backward_weights_common.hpp +++ b/tests/gtests/test_convolution_backward_weights_common.hpp @@ -124,10 +124,55 @@ class convolution_backward_weights_test virtual void SetUp() { auto p = ::testing::TestWithParam< test_convolution_params_t>::GetParam(); + + SKIP_IF_CUDA( + !(cuda_check_format_tags(p.formats.src_format) + && cuda_check_format_tags(p.formats.dst_format) + && (cuda_check_format_tags(p.formats.weights_format) + || (impl::utils::one_of( + p.formats.weights_format, + /* weights formats */ + memory::format_tag::gowi, + memory::format_tag::gohwi, + memory::format_tag::godhwi, + memory::format_tag::owi, + memory::format_tag::ohwi, + memory::format_tag::odhwi))) + && data_traits::data_type + == memory::data_type::f32 + && data_traits::data_type + == 
memory::data_type::f32 + && data_traits::data_type + == memory::data_type::f32 + && check_cuda_alg_format(p.formats.dst_format, + p.formats.weights_format, p.aalgorithm)), + "format is not supported."); + catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tags(memory::format_tag tag) { + return impl::utils::one_of(tag, memory::format_tag::ab, + memory::format_tag::abc, memory::format_tag::abcd, + memory::format_tag::abcde, memory::format_tag::abcdef, + memory::format_tag::acb, memory::format_tag::acdb, + memory::format_tag::acdeb); + } + + bool check_cuda_alg_format(memory::format_tag dst_fmt, + memory::format_tag wei_fmt, algorithm alg) { + bool res = dst_fmt == wei_fmt; + if (alg == dnnl::algorithm::convolution_winograd) { + res = res + && impl::utils::one_of(wei_fmt, memory::format_tag::ab, + memory::format_tag::abc, memory::format_tag::abcd, + memory::format_tag::abcde, + memory::format_tag::abcdef); + } + return res; + } + void Test() { auto p = ::testing::TestWithParam< test_convolution_params_t>::GetParam(); diff --git a/tests/gtests/test_convolution_eltwise_forward_common.hpp b/tests/gtests/test_convolution_eltwise_forward_common.hpp index f3963382bd9..04fe6984658 100644 --- a/tests/gtests/test_convolution_eltwise_forward_common.hpp +++ b/tests/gtests/test_convolution_eltwise_forward_common.hpp @@ -115,12 +115,59 @@ class convolution_eltwise_test : public ::testing::TestWithParam { protected: virtual void SetUp() { + memory::data_type data_type_src = data_traits::data_type; + memory::data_type data_type_dst = data_traits::data_type; + memory::data_type data_type_wei = data_traits::data_type; + + SKIP_IF(unsupported_data_type(data_type_src), + "Engine does not support this data type."); + SKIP_IF(unsupported_data_type(data_type_dst), + "Engine does not support this data type."); + SKIP_IF(unsupported_data_type(data_type_wei), + "Engine does not support this data type."); + test_convolution_eltwise_params_t p = ::testing::TestWithParam< test_convolution_eltwise_params_t>::GetParam(); + + SKIP_IF_CUDA( + !(cuda_check_format_tags(p.formats.src_format, data_type_src) + && cuda_check_format_tags( + p.formats.dst_format, data_type_dst) + && (cuda_check_format_tags( + p.formats.weights_format, data_type_wei) + || impl::utils::one_of(p.formats.weights_format, + /* weights formats */ + memory::format_tag::gowi, + memory::format_tag::gohwi, + memory::format_tag::godhwi, + memory::format_tag::owi, + memory::format_tag::ohwi, + memory::format_tag::odhwi))), + "Format is not supported."); + SKIP_IF_CUDA(p.alg != algorithm::eltwise_relu + && p.alg != algorithm::eltwise_bounded_relu + && p.alg != algorithm::eltwise_tanh + && p.alg != algorithm::eltwise_elu + && p.alg != algorithm::eltwise_logistic, + "Unsupported algorithm type for CUDA"); + SKIP_IF_CUDA(p.alg == algorithm::eltwise_relu && p.eltwise_alpha != 0.0, + "DNNL only supports relu w/ slope=0 for integers"); + catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tags(memory::format_tag tag, memory::data_type dt) { + return ((impl::utils::one_of(tag, memory::format_tag::ab, + memory::format_tag::abc, memory::format_tag::abcd, + memory::format_tag::abcde, memory::format_tag::abcdef, + memory::format_tag::acb, memory::format_tag::acdb, + memory::format_tag::acdeb)) + || (dt == memory::data_type::s8 + && impl::utils::one_of(tag, memory::format_tag::aBcd4b, + memory::format_tag::aBcde4b))); + } + virtual void Test() { 
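+        // Note for CUDA: the padding restriction is checked further down in
+        // this function, once padR has been computed for the given case.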
test_convolution_eltwise_params_t p = ::testing::TestWithParam< test_convolution_eltwise_params_t>::GetParam(); @@ -186,6 +233,9 @@ class convolution_eltwise_test ++padR[1]; } + SKIP_IF_CUDA(cd.padh < padR[0] || cd.padw < padR[1], + "Unsupported padding for CUDA."); + dnnl::post_ops ops; ops.append_eltwise(1.0, p.alg, p.eltwise_alpha, p.eltwise_beta); diff --git a/tests/gtests/test_convolution_format_any.cpp b/tests/gtests/test_convolution_format_any.cpp index 32a73f5e914..eab4c2ace8e 100644 --- a/tests/gtests/test_convolution_format_any.cpp +++ b/tests/gtests/test_convolution_format_any.cpp @@ -59,6 +59,9 @@ class convolution_any_fmt_test_t ASSERT_EQ(p.aalgorithm, algorithm::convolution_direct); auto eng = get_test_engine(); memory::data_type data_type = data_traits::data_type; + SKIP_IF_CUDA((p.expected_src_fmt == data_fmt_t::blocked_cX + || p.expected_dst_fmt == data_fmt_t::blocked_cX), + "unsupported format"); ASSERT_EQ(data_type, dnnl::memory::data_type::f32); test_convolution_sizes_t cd = p.test_cd; diff --git a/tests/gtests/test_convolution_forward_common.hpp b/tests/gtests/test_convolution_forward_common.hpp index 35c98c689ba..0640ccc7718 100644 --- a/tests/gtests/test_convolution_forward_common.hpp +++ b/tests/gtests/test_convolution_forward_common.hpp @@ -104,12 +104,51 @@ class convolution_forward_test : public ::testing::TestWithParam { protected: virtual void SetUp() { + memory::data_type data_type_src = data_traits::data_type; + memory::data_type data_type_dst = data_traits::data_type; + memory::data_type data_type_wei = data_traits::data_type; + + SKIP_IF(unsupported_data_type(data_type_src), + "Engine does not support this data type."); + SKIP_IF(unsupported_data_type(data_type_dst), + "Engine does not support this data type."); + SKIP_IF(unsupported_data_type(data_type_wei), + "Engine does not support this data type."); + auto p = ::testing::TestWithParam< test_convolution_params_t>::GetParam(); + + SKIP_IF_CUDA( + !(cuda_check_format_tags(p.formats.src_format, data_type_src) + && cuda_check_format_tags( + p.formats.dst_format, data_type_dst) + && (cuda_check_format_tags( + p.formats.weights_format, data_type_wei) + || impl::utils::one_of(p.formats.weights_format, + /* weights formats */ + memory::format_tag::gowi, + memory::format_tag::gohwi, + memory::format_tag::godhwi, + memory::format_tag::owi, + memory::format_tag::ohwi, + memory::format_tag::odhwi))), + "Format is not supported."); + catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tags(memory::format_tag tag, memory::data_type dt) { + return ((impl::utils::one_of(tag, memory::format_tag::ab, + memory::format_tag::abc, memory::format_tag::abcd, + memory::format_tag::abcde, memory::format_tag::abcdef, + memory::format_tag::acb, memory::format_tag::acdb, + memory::format_tag::acdeb)) + || (dt == memory::data_type::s8 + && impl::utils::one_of(tag, memory::format_tag::aBcd4b, + memory::format_tag::aBcde4b))); + } + void Test() { auto p = ::testing::TestWithParam< test_convolution_params_t>::GetParam(); diff --git a/tests/gtests/test_deconvolution.cpp b/tests/gtests/test_deconvolution.cpp index 81fc18399f0..76a4d09712c 100644 --- a/tests/gtests/test_deconvolution.cpp +++ b/tests/gtests/test_deconvolution.cpp @@ -131,12 +131,45 @@ class deconvolution_test_t protected: void SetUp() override { + memory::data_type data_type = data_traits::data_type; + SKIP_IF(unsupported_data_type(data_type), + "Engine does not support this data type."); + auto p = 
::testing::TestWithParam< deconvolution_test_params_t>::GetParam(); + + SKIP_IF_CUDA( + !(cuda_check_format_tags(p.formats.src_format, data_type) + && cuda_check_format_tags( + p.formats.dst_format, data_type) + && cuda_check_src_wei_format_tags(p.formats.src_format, + p.formats.weights_format, p.sizes.ng > 1)), + "Format is not supported."); + catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tags(memory::format_tag tag, memory::data_type dt) { + return ((impl::utils::one_of(tag, memory::format_tag::ab, + memory::format_tag::abc, memory::format_tag::abcd, + memory::format_tag::abcde, memory::format_tag::abcdef, + memory::format_tag::acb, memory::format_tag::acdb, + memory::format_tag::acdeb)) + || (dt == memory::data_type::s8 + && impl::utils::one_of(tag, memory::format_tag::aBcd4b, + memory::format_tag::aBcde4b))); + } + + bool cuda_check_src_wei_format_tags( + memory::format_tag src, memory::format_tag wei, bool is_grouped) { + if (src == memory::format_tag::abcd) return true; + if (src == memory::format_tag::acdb) + return wei + != (is_grouped ? memory::format_tag::abcde + : memory::format_tag::abcd); + } + void Test() { auto p = ::testing::TestWithParam< deconvolution_test_params_t>::GetParam(); @@ -190,6 +223,8 @@ class deconvolution_test_t padR = {right_padding(dd.oh, dd.ih, dd.kh, dd.padh, dd.strh, dd.dilh), right_padding(dd.ow, dd.iw, dd.kw, dd.padw, dd.strw, dd.dilw)}; + SKIP_IF_CUDA(p.sizes.padh < padR[0] || p.sizes.padw < padR[1], + "Padding not supported"); Forward(); BackwardData(); BackwardWeights(); @@ -529,6 +564,6 @@ GPU_INST_TEST_CASE(SimpleSmall_NHWC, PARAMS(nhwc, ohwi, x, nhwc, 2, 1, 6, 4, 4, 4, 4, 4, 3, 3, 1, 1, 1, 1), PARAMS(nhwc, ohwi, x, nhwc, 2, 1, 6, 2, 2, 4, 4, 4, 3, 3, 0, 0, 1, 1), PARAMS(nchw, goihw, x, nchw, 2, 2, 6, 4, 4, 4, 4, 4, 3, 3, 1, 1, 1, 1), - PARAMS(nchw, goihw, x, nhwc, 2, 2, 6, 4, 4, 4, 4, 4, 3, 3, 1, 1, 1, 1)); - + PARAMS(nchw, goihw, x, nhwc, 2, 2, 6, 4, 4, 4, 4, 4, 3, 3, 1, 1, 1, 1), + PARAMS(nhwc, gohwi, x, nhwc, 2, 2, 6, 4, 4, 4, 4, 4, 3, 3, 1, 1, 1, 1)); } // namespace dnnl diff --git a/tests/gtests/test_eltwise.cpp b/tests/gtests/test_eltwise.cpp index 970a167dec8..aeb595cc736 100644 --- a/tests/gtests/test_eltwise.cpp +++ b/tests/gtests/test_eltwise.cpp @@ -348,10 +348,30 @@ class eltwise_test_t : public ::testing::TestWithParam { && (data_type == memory::data_type::s32 || data_type == memory::data_type::s8), "oneDNN only supports relu w/ slope=0 for integers"); + SKIP_IF_CUDA(p.alg_kind != algorithm::eltwise_relu + && p.alg_kind != algorithm::eltwise_bounded_relu + && p.alg_kind != algorithm::eltwise_tanh + && p.alg_kind != algorithm::eltwise_elu + && p.alg_kind != algorithm::eltwise_logistic, + "Unsupported algorithm type for CUDA"); + SKIP_IF_CUDA(p.alg_kind == algorithm::eltwise_relu && p.alpha != 0.0, + "DNNL only supports relu w/ slope=0 for integers"); + SKIP_IF_CUDA(!cuda_check_format_tag(p.data_format), + "Unsupported format tag"); + SKIP_IF_CUDA(!cuda_check_format_tag(p.diff_format), + "Unsupported format tag"); catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tag(memory::format_tag tag) { + // Blocking is not supported by cuDNN + return (tag != memory::format_tag::aBcd8b + && tag != memory::format_tag::aBcd16b + && tag != memory::format_tag::aBcde8b + && tag != memory::format_tag::aBcde16b); + } + void Test() { p = ::testing::TestWithParam::GetParam(); @@ -404,6 +424,12 @@ class eltwise_test_t : public 
::testing::TestWithParam { } void Backward() { + SKIP_IF_CUDA(p.alg_kind != algorithm::eltwise_relu + && p.alg_kind != algorithm::eltwise_bounded_relu, + "Unsupported algorithm"); + SKIP_IF_CUDA(p.diff_format != p.data_format, + "CUDA does not support different data formats for data and " + "diff vectors"); memory::desc diff_data_desc(p.dims, data_type, p.diff_format); auto diff_src = test::make_memory(diff_data_desc, eng); auto diff_dst = test::make_memory(diff_data_desc, eng); diff --git a/tests/gtests/test_gemm_common.hpp b/tests/gtests/test_gemm_common.hpp index 0a882a61ac4..10bd347698c 100644 --- a/tests/gtests/test_gemm_common.hpp +++ b/tests/gtests/test_gemm_common.hpp @@ -1007,6 +1007,7 @@ class gemm_test_common : public ::testing::TestWithParam { || data_traits::data_type == memory::data_type::s8), "SYCL GPU int GEMM not implemented."); + SKIP_IF_CUDA(true, "Test not supported in CUDA backend"); #endif bool is_bf16bf16f32 = true diff --git a/tests/gtests/test_iface_pd_iter.cpp b/tests/gtests/test_iface_pd_iter.cpp index edae3d767a2..b89a90ea914 100644 --- a/tests/gtests/test_iface_pd_iter.cpp +++ b/tests/gtests/test_iface_pd_iter.cpp @@ -68,6 +68,7 @@ TEST_F(pd_iter_test_t, TestReLUImpls) { } TEST(pd_next_impl, TestEltwiseImpl) { + SKIP_IF_CUDA(true, "Unsupported memory format for CUDA"); auto eng = get_test_engine(); memory::desc md( {8, 32, 4, 4}, memory::data_type::f32, memory::format_tag::nChw8c); diff --git a/tests/gtests/test_iface_runtime_attr.cpp b/tests/gtests/test_iface_runtime_attr.cpp index cf2a67947c1..e7619684f8e 100644 --- a/tests/gtests/test_iface_runtime_attr.cpp +++ b/tests/gtests/test_iface_runtime_attr.cpp @@ -121,6 +121,8 @@ TEST_F(runtime_attr_test_t, TestConcat) { } TEST_F(runtime_attr_test_t, TestConv) { + // Datatype u8 is not supported in the Nvidia backend + SKIP_IF_CUDA(true, "Unsupported datatype for CUDA"); memory::desc src_md {{1, 16, 7, 7}, data_type::u8, tag::any}; memory::desc wei_md {{32, 16, 3, 3}, data_type::s8, tag::any}; memory::desc dst_md {{1, 32, 7, 7}, data_type::s32, tag::any}; @@ -201,6 +203,8 @@ TEST_F(runtime_attr_test_t, TestEltwise) { } TEST_F(runtime_attr_test_t, TestInnerProduct) { + // Datatype u8 is not supported in the Nvidia backend + SKIP_IF_CUDA(true, "Unsupported datatype for CUDA"); memory::desc src_md {{1, 16, 7, 7}, data_type::u8, tag::any}; memory::desc wei_md {{32, 16, 7, 7}, data_type::s8, tag::any}; memory::desc dst_md {{1, 32}, data_type::s32, tag::any}; @@ -222,6 +226,7 @@ TEST_F(runtime_attr_test_t, TestInnerProduct) { } TEST_F(runtime_attr_test_t, TestLNorm) { + SKIP_IF_CUDA(true, "Layer normalization primitive not supported for CUDA"); for (auto dt : {data_type::f32}) { memory::desc md {{1, 16, 16}, dt, tag::abc}; memory::desc stat_md {{1, 16}, data_type::f32, tag::ab}; @@ -353,6 +358,8 @@ CPU_TEST_F(runtime_attr_test_t, TestReorder) { } TEST_F(runtime_attr_test_t, TestRNN) { + SKIP_IF_CUDA(true, "RNN primitive not supported for CUDA"); + #if !DNNL_X64 return; #endif @@ -399,6 +406,7 @@ TEST_F(runtime_attr_test_t, TestRNN) { } TEST_F(runtime_attr_test_t, TestShuffle) { + SKIP_IF_CUDA(true, "Shuffle primitive not supported for CUDA"); memory::desc md {{1, 16, 3, 3}, data_type::f32, tag::abcd}; shuffle_forward::desc op_d(prop_kind::forward, md, 1, 4); CHECK_OK(shuffle_forward::primitive_desc(op_d, eng)); diff --git a/tests/gtests/test_iface_wino_convolution.cpp b/tests/gtests/test_iface_wino_convolution.cpp index 6d8b33aafb6..5625a33a443 100644 --- a/tests/gtests/test_iface_wino_convolution.cpp +++ 
b/tests/gtests/test_iface_wino_convolution.cpp @@ -122,9 +122,7 @@ TEST_F(wino_conv_test_t, TestLargePadding) { algorithm::convolution_winograd, src_md, wei_md, dst_md, {1, 1}, {2, 2}, {2, 2}); - // oneDNN backend does not support pad != 1 for Wino conv, which may - // not be the case for other backends. - bool large_pad_is_supported = false; + bool large_pad_is_supported = is_nvidia_gpu(eng); if (input.wino_supported && large_pad_is_supported) { EXPECT_NO_THROW( convolution_forward::primitive_desc(fwd_op_desc, eng)); diff --git a/tests/gtests/test_inner_product_backward_data.cpp b/tests/gtests/test_inner_product_backward_data.cpp index 9ec85f3a682..98892455489 100644 --- a/tests/gtests/test_inner_product_backward_data.cpp +++ b/tests/gtests/test_inner_product_backward_data.cpp @@ -96,12 +96,43 @@ class inner_product_test_bwd_data_t protected: void SetUp() override { auto p = ::testing::TestWithParam::GetParam(); + SKIP_IF_CUDA(!cuda_check_format_tags(p.diff_src_format, + p.weights_format, p.diff_dst_format), + "Unsupported format tag"); + SKIP_IF_CUDA(p.ndims > 5, "Unsupported number of dimensions"); catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } - void Test() { + bool cuda_check_format_tags(memory::format_tag diff_src_format, + memory::format_tag wei_format, memory::format_tag diff_dst_format) { + bool diff_src_ok = diff_src_format == memory::format_tag::ncdhw + || diff_src_format == memory::format_tag::ndhwc + || diff_src_format == memory::format_tag::nchw + || diff_src_format == memory::format_tag::nhwc + || diff_src_format == memory::format_tag::ncw + || diff_src_format == memory::format_tag::nwc + || diff_src_format == memory::format_tag::nc + || diff_src_format == memory::format_tag::any; + bool wei_ok = wei_format == memory::format_tag::oidhw + || wei_format == memory::format_tag::odhwi + || wei_format == memory::format_tag::dhwio + || wei_format == memory::format_tag::oihw + || wei_format == memory::format_tag::hwio + || wei_format == memory::format_tag::ohwi + || wei_format == memory::format_tag::oiw + || wei_format == memory::format_tag::owi + || wei_format == memory::format_tag::wio + || wei_format == memory::format_tag::io + || wei_format == memory::format_tag::oi + || wei_format == memory::format_tag::any; + bool diff_dst_ok = diff_dst_format == memory::format_tag::any + || diff_dst_format == memory::format_tag::nc; + + return diff_src_ok && wei_ok && diff_dst_ok; + } + void Test() { auto p = ::testing::TestWithParam::GetParam(); test_inner_product_descr_t ipd = p.test_ipd; bool has_spatial = ipd.kh > 1 || ipd.kw > 1; diff --git a/tests/gtests/test_inner_product_backward_weights.cpp b/tests/gtests/test_inner_product_backward_weights.cpp index 4474277b366..46632c96d31 100644 --- a/tests/gtests/test_inner_product_backward_weights.cpp +++ b/tests/gtests/test_inner_product_backward_weights.cpp @@ -123,10 +123,49 @@ class inner_product_test_bwd_weights_t protected: void SetUp() override { auto p = ::testing::TestWithParam::GetParam(); + SKIP_IF_CUDA( + !cuda_check_format_tags(p.src_format, p.diff_weights_format, + p.diff_bias_format, p.diff_dst_format), + "Unsupported format tag"); + SKIP_IF_CUDA(p.ndims > 5, "Unsupported number of dimensions"); catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tags(memory::format_tag src_format, + memory::format_tag diff_wei_format, + memory::format_tag diff_bia_format, + memory::format_tag diff_dst_format) { + bool src_ok = src_format
== memory::format_tag::ncdhw + || src_format == memory::format_tag::ndhwc + || src_format == memory::format_tag::nchw + || src_format == memory::format_tag::nhwc + || src_format == memory::format_tag::ncw + || src_format == memory::format_tag::nwc + || src_format == memory::format_tag::nc + || src_format == memory::format_tag::any; + bool diff_wei_ok = diff_wei_format == memory::format_tag::oidhw + || diff_wei_format == memory::format_tag::odhwi + || diff_wei_format == memory::format_tag::dhwio + || diff_wei_format == memory::format_tag::oihw + || diff_wei_format == memory::format_tag::ohwi + || diff_wei_format == memory::format_tag::hwio + || diff_wei_format == memory::format_tag::oiw + || diff_wei_format == memory::format_tag::owi + || diff_wei_format == memory::format_tag::wio + || diff_wei_format == memory::format_tag::io + || diff_wei_format == memory::format_tag::oi + || diff_wei_format == memory::format_tag::any; + bool diff_bia_ok = diff_bia_format == memory::format_tag::undef + || diff_bia_format == memory::format_tag::any + || diff_bia_format == memory::format_tag::a + || diff_bia_format == memory::format_tag::x; + bool diff_dst_ok = diff_dst_format == memory::format_tag::any + || diff_dst_format == memory::format_tag::nc; + + return src_ok && diff_wei_ok && diff_bia_ok && diff_dst_ok; + } + void Test() { auto p = ::testing::TestWithParam::GetParam(); test_inner_product_descr_t ipd = p.test_ipd; diff --git a/tests/gtests/test_inner_product_forward.cpp b/tests/gtests/test_inner_product_forward.cpp index 0539744553a..da90ac3fc78 100644 --- a/tests/gtests/test_inner_product_forward.cpp +++ b/tests/gtests/test_inner_product_forward.cpp @@ -88,10 +88,47 @@ class inner_product_test_t protected: void SetUp() override { auto p = ::testing::TestWithParam::GetParam(); + SKIP_IF_CUDA(!cuda_check_format_tags(p.src_format, p.weights_format, + p.bias_format, p.dst_format), + "Unsupported format tag"); + SKIP_IF_CUDA(p.ndims > 5, "Unsupported number of dimensions"); catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tags(memory::format_tag src_format, + memory::format_tag wei_format, memory::format_tag bia_format, + memory::format_tag dst_format) { + bool src_ok = src_format == memory::format_tag::ncdhw + || src_format == memory::format_tag::ndhwc + || src_format == memory::format_tag::nchw + || src_format == memory::format_tag::nhwc + || src_format == memory::format_tag::ncw + || src_format == memory::format_tag::nwc + || src_format == memory::format_tag::nc + || src_format == memory::format_tag::any; + bool wei_ok = wei_format == memory::format_tag::oidhw + || wei_format == memory::format_tag::odhwi + || wei_format == memory::format_tag::dhwio + || wei_format == memory::format_tag::oihw + || wei_format == memory::format_tag::ohwi + || wei_format == memory::format_tag::hwio + || wei_format == memory::format_tag::oiw + || wei_format == memory::format_tag::owi + || wei_format == memory::format_tag::wio + || wei_format == memory::format_tag::io + || wei_format == memory::format_tag::oi + || wei_format == memory::format_tag::any; + bool bia_ok = bia_format == memory::format_tag::undef + || bia_format == memory::format_tag::any + || bia_format == memory::format_tag::a + || bia_format == memory::format_tag::x; + bool dst_ok = dst_format == memory::format_tag::any + || dst_format == memory::format_tag::nc; + + return src_ok && wei_ok && bia_ok && dst_ok; + } + void Test() { auto p = ::testing::TestWithParam::GetParam(); 
test_inner_product_descr_t ipd = p.test_ipd; diff --git a/tests/gtests/test_layer_normalization.cpp b/tests/gtests/test_layer_normalization.cpp index f3fe8b600f1..d7aa9ad7f31 100644 --- a/tests/gtests/test_layer_normalization.cpp +++ b/tests/gtests/test_layer_normalization.cpp @@ -62,6 +62,7 @@ class lnorm_test_t : public ::testing::TestWithParam { protected: void SetUp() override { + SKIP_IF_CUDA(true, "Layer normalization not supported by CUDA."); p = ::testing::TestWithParam::GetParam(); catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); diff --git a/tests/gtests/test_logsoftmax.cpp b/tests/gtests/test_logsoftmax.cpp index 775ec533d90..aa717a3d56f 100644 --- a/tests/gtests/test_logsoftmax.cpp +++ b/tests/gtests/test_logsoftmax.cpp @@ -46,11 +46,22 @@ class logsoftmax_test_t protected: void SetUp() override { + data_dt = data_traits::data_type; + p = ::testing::TestWithParam< logsoftmax_test_params_t>::GetParam(); + SKIP_IF(unsupported_data_type(data_dt), + "Engine does not support this data type."); + SKIP_IF_CUDA(!cuda_check_format_tag(p.memory_format), + "Unsupported format tag"); + SKIP_IF_CUDA(p.axis != 1, "Unsupported axis values for CUDA"); catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tag(memory::format_tag tag) { + return (tag != memory::format_tag::aBcd8b + && tag != memory::format_tag::aBc16b); + } void Forward() { // logsoftmax specific types and values @@ -140,6 +151,8 @@ class logsoftmax_test_t auto eng = get_test_engine(); auto strm = make_stream(eng); auto prec = data_traits::data_type; + SKIP_IF_CUDA(prec == memory::data_type::bf16, + "Unsupported datatype for CUDA"); auto mem_desc = memory::desc(p.dims, prec, p.memory_format); auto diff_mem_desc = memory::desc(p.dims, prec, p.diff_memory_format); diff --git a/tests/gtests/test_lrn_backward.cpp b/tests/gtests/test_lrn_backward.cpp index 1a341281e02..8f314a3bff8 100644 --- a/tests/gtests/test_lrn_backward.cpp +++ b/tests/gtests/test_lrn_backward.cpp @@ -208,21 +208,37 @@ class lrn_test_t : public ::testing::TestWithParam { void SetUp() override { data_type = data_traits::data_type; - SKIP_IF(data_type == memory::data_type::bf16 - && get_test_engine_kind() == engine::kind::gpu, - "GPU does not support bf16 data type."); SKIP_IF(unsupported_data_type(data_type), "Engine does not support this data type."); p = ::testing::TestWithParam::GetParam(); - + SKIP_IF_CUDA(!cuda_check_format_tags(p.data_format, p.diff_data_format), + "Unsupported format tag"); + SKIP_IF_CUDA(p.aalgorithm != algorithm::lrn_across_channels, + "Unsupported algorithm"); ASSERT_TRUE(p.aalgorithm == algorithm::lrn_across_channels || p.aalgorithm == algorithm::lrn_within_channel); catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } - + bool cuda_check_format_tags(memory::format_tag data_format, + memory::format_tag diff_data_format) { + bool data_ok = data_format == memory::format_tag::ncdhw + || data_format == memory::format_tag::nchw + || data_format == memory::format_tag::nhwc + || data_format == memory::format_tag::ncw + || data_format == memory::format_tag::nwc + || data_format == memory::format_tag::any; + bool diff_data_ok = diff_data_format == memory::format_tag::ncdhw + || diff_data_format == memory::format_tag::nchw + || diff_data_format == memory::format_tag::nhwc + || diff_data_format == memory::format_tag::ncw + || diff_data_format == memory::format_tag::nwc + || diff_data_format == memory::format_tag::any; + + return 
data_ok && diff_data_ok; + } void Test() { p = ::testing::TestWithParam::GetParam(); diff --git a/tests/gtests/test_lrn_forward.cpp b/tests/gtests/test_lrn_forward.cpp index 2ed569ca43b..d6206fd8996 100644 --- a/tests/gtests/test_lrn_forward.cpp +++ b/tests/gtests/test_lrn_forward.cpp @@ -122,9 +122,22 @@ class lrn_forward_test_t : public ::testing::TestWithParam { SKIP_IF(unsupported_data_type(data_type), "Engine does not support this data type."); p = ::testing::TestWithParam::GetParam(); + SKIP_IF_CUDA( + !cuda_check_format_tags(p.format), "Unsupported format tag"); + SKIP_IF_CUDA(p.aalgorithm != algorithm::lrn_across_channels, + "Unsupported algorithm"); catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tags(memory::format_tag format) { + bool ok = format == memory::format_tag::ncdhw + || format == memory::format_tag::nchw + || format == memory::format_tag::nhwc + || format == memory::format_tag::ncw + || format == memory::format_tag::nwc + || format == memory::format_tag::any; + return ok; + } void Test() { ASSERT_TRUE(p.aprop_kind == prop_kind::forward_training diff --git a/tests/gtests/test_matmul.cpp b/tests/gtests/test_matmul.cpp index 226debdbc5e..44b329d3c94 100644 --- a/tests/gtests/test_matmul.cpp +++ b/tests/gtests/test_matmul.cpp @@ -95,6 +95,19 @@ class matmul_iface_test_t SKIP_IF(unsupported_data_type(p.base.src.dt), "Engine does not support this data type."); + SKIP_IF(unsupported_data_type(p.base.weights.dt), + "Engine does not support this data type."); + SKIP_IF(unsupported_data_type(p.base.dst.dt), + "Engine does not support this data type."); + SKIP_IF(unsupported_data_type(p.base.bia_dt), + "Engine does not support this data type."); + + SKIP_IF_CUDA((p.attr.zero_points.src != 0 || p.attr.zero_points.dst != 0 + || p.attr.zero_points.weights != 0), + "Zero points not supported for CUDA"); + + SKIP_IF_CUDA((p.attr.scale_flags & P::MASK_MASK) == P::PER_N, + "Per dimensional scaling is not supported for CUDA"); catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status, false); @@ -373,7 +386,8 @@ static auto cases_ef = []() { {{10, 20}, data_type::f32, tag::ab}, data_type::u8}, {}, true, dnnl_unimplemented}); // XXX: disable assert in type_helpers.hpp: default_accum_data_type(...) 
- //cases.push_back({{{{10, 1}, data_type::u8, tag::ab}, {{1, 20}, data_type::u8, tag::ab}, + // cases.push_back({{{{10, 1}, data_type::u8, tag::ab}, {{1, 20}, + // data_type::u8, tag::ab}, // {{10, 20}, data_type::u8, tag::ab}}, // {}, true, dnnl_unimplemented}); diff --git a/tests/gtests/test_pooling_backward.cpp b/tests/gtests/test_pooling_backward.cpp index 2cbf1421712..a5d84f203fb 100644 --- a/tests/gtests/test_pooling_backward.cpp +++ b/tests/gtests/test_pooling_backward.cpp @@ -40,6 +40,19 @@ struct pool_bwd_test_params_t { dnnl_status_t expected_status; }; +bool cuda_check_format_tags(memory::format_tag format) { + bool format_ok = format == memory::format_tag::ncdhw + || format == memory::format_tag::ndhwc + || format == memory::format_tag::nchw + || format == memory::format_tag::nhwc + || format == memory::format_tag::ncw + || format == memory::format_tag::nwc + || format == memory::format_tag::any + || format == memory::format_tag::nCdhw4c; + + return format_ok; +} + template void check_pool_fwd( const pool_bwd_test_params_t &p, const memory &src, const memory &dst) { @@ -266,6 +279,12 @@ class pooling_bwd_test_t protected: void SetUp() override { p = ::testing::TestWithParam::GetParam(); + SKIP_IF_CUDA(!cuda_check_format_tags(p.diff_src_format), + "Unsupported format tag"); + SKIP_IF_CUDA(!cuda_check_format_tags(p.diff_dst_format), + "Unsupported format tag"); + SKIP_IF_CUDA(p.aalgorithm == algorithm::pooling_max, + "Unsupported algorithm MAX"); catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } diff --git a/tests/gtests/test_pooling_forward.cpp b/tests/gtests/test_pooling_forward.cpp index d2562459265..6b8247c82aa 100644 --- a/tests/gtests/test_pooling_forward.cpp +++ b/tests/gtests/test_pooling_forward.cpp @@ -42,6 +42,19 @@ struct pool_test_params_t { dnnl_status_t expected_status; }; +bool cuda_check_format_tags(memory::format_tag format) { + bool format_ok = format == memory::format_tag::ncdhw + || format == memory::format_tag::ndhwc + || format == memory::format_tag::nchw + || format == memory::format_tag::nhwc + || format == memory::format_tag::ncw + || format == memory::format_tag::nwc + || format == memory::format_tag::any + || format == memory::format_tag::nCdhw4c; + + return format_ok; +} + template void check_pool_fwd(const pool_test_params_t &p, const memory &src, const memory &dst, const memory &ws) { @@ -69,6 +82,8 @@ void check_pool_fwd(const pool_test_params_t &p, const memory &src, auto pd = p.test_pd; size_t padded_c = src_d.data.padded_dims[1]; + const bool is_cudnn_gpu = is_nvidia_gpu(src.get_engine()); + dnnl::impl::parallel_nd(pd.mb, pd.c, pd.od, pd.oh, pd.ow, [&](memory::dim n, memory::dim c, memory::dim od, memory::dim oh, memory::dim ow) { @@ -150,8 +165,11 @@ void check_pool_fwd(const pool_test_params_t &p, const memory &src, const data_t out_ref = (data_t)acc_ref; ASSERT_NEAR(out, out_ref, 1e-6); - if (p.aalgorithm == algorithm::pooling_max - && p.aprop_kind == prop_kind::forward_training) { + // The workspace layout is different when the cuDNN backend is used + // and therefore this check must be skipped + if ((p.aalgorithm == algorithm::pooling_max + && p.aprop_kind == prop_kind::forward_training) + && !is_cudnn_gpu) { ASSERT_EQ(out_index, out_ref_index) << " n = " << n << " c = " << c << " od = " << od << " oh = " << oh << " ow = " << ow; @@ -166,6 +184,14 @@ class pooling_test_t : public ::testing::TestWithParam { protected: void SetUp() override { p = ::testing::TestWithParam::GetParam(); + + 
SKIP_IF(unsupported_data_type(data_traits::data_type), + "Engine does not support this data type."); + SKIP_IF_CUDA(!cuda_check_format_tags(p.src_format), + "Unsupported format tag"); + SKIP_IF_CUDA(!cuda_check_format_tags(p.dst_format), + "Unsupported format tag"); + catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } @@ -234,6 +260,18 @@ class pooling_test_t : public ::testing::TestWithParam { memory workspace; + for (size_t i = 0; i < pad_l.size(); ++i) { + SKIP_IF_CUDA( + (p.aalgorithm + == dnnl::algorithm::pooling_avg_include_padding) + && (pad_l[i] < pad_r[i]), + "Asymmetric padding is not supported!"); + } + + for (size_t i = 0; i < dilation.size(); ++i) { + SKIP_IF_CUDA(dilation[i] != 0, "Dilation is not supported!"); + } + if (pd.dd == 0 && pd.dh == 0 && pd.dw == 0) { auto pool_desc = pooling_forward::desc(p.aprop_kind, p.aalgorithm, p_src_desc, p_dst_desc, strides, ker, pad_l, pad_r); diff --git a/tests/gtests/test_reorder_common.hpp b/tests/gtests/test_reorder_common.hpp index 4df9dd8b403..fa3728eef2f 100644 --- a/tests/gtests/test_reorder_common.hpp +++ b/tests/gtests/test_reorder_common.hpp @@ -78,6 +78,12 @@ class reorder_simple_test test_simple_params p = ::testing::TestWithParam::GetParam(); + SKIP_IF_CUDA(!((supported_format(p.fmt_i) + || supported_blocking(prec_i, p.fmt_i)) + && (supported_format(p.fmt_o) + || supported_blocking(prec_o, p.fmt_o))), + "Unsupported CUDA format tag / data type"); + catch_expected_failures( [=]() { engine eng = get_test_engine(); @@ -86,6 +92,19 @@ class reorder_simple_test p.expect_to_fail, p.expected_status); } #endif + bool supported_format(memory::format_tag fmt) { + return impl::utils::one_of(fmt, memory::format_tag::abcde, + memory::format_tag::acdeb, memory::format_tag::abcd, + memory::format_tag::acdb, memory::format_tag::abc, + memory::format_tag::acb, memory::format_tag::ab, + memory::format_tag::ba, memory::format_tag::a, + memory::format_tag::any); + } + + bool supported_blocking(memory::data_type dt, memory::format_tag fmt) { + return (dt == dnnl_u8 + && impl::utils::one_of(fmt, dnnl_aBcd4b, dnnl_aBcde4b)); + } void Test(engine &eng_i, engine &eng_o) { using data_i_t = typename reorder_types::first_type; @@ -101,6 +120,14 @@ class reorder_simple_test test_simple_params p = ::testing::TestWithParam::GetParam(); +#ifdef DNNL_SYCL_CUDA + SKIP_IF(!((supported_format(p.fmt_i) + || supported_blocking(prec_i, p.fmt_i)) + && (supported_format(p.fmt_o) + || supported_blocking(prec_o, p.fmt_o))), + "Unsupported CUDA format tag / data type"); +#endif + catch_expected_failures([&]() { RunTest(eng_i, eng_o); }, p.expect_to_fail, p.expected_status); } diff --git a/tests/gtests/test_resampling.cpp b/tests/gtests/test_resampling.cpp index df91e67df7c..2ea21863bbc 100644 --- a/tests/gtests/test_resampling.cpp +++ b/tests/gtests/test_resampling.cpp @@ -215,8 +215,19 @@ class resampling_test_t stream strm; protected: + bool cuda_supported_format_tag(memory::format_tag tag) { + return impl::utils::one_of( + tag, dnnl_abc, dnnl_abcd, dnnl_acb, dnnl_acdb); + } void SetUp() override { p = ::testing::TestWithParam::GetParam(); + SKIP_IF_CUDA(p.aalgorithm == algorithm::resampling_nearest, + "Nearest algorithm is not supported by the cuDNN backend"); + SKIP_IF_CUDA(p.ndims == 5, + "cuDNN resampling backend does not support 5D tensors"); + SKIP_IF_CUDA(!cuda_supported_format_tag(p.src_format), + "Unsupported format tag"); + catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } diff --git
a/tests/gtests/test_shuffle.cpp b/tests/gtests/test_shuffle.cpp index c2e71f6c49d..5293073c7b1 100644 --- a/tests/gtests/test_shuffle.cpp +++ b/tests/gtests/test_shuffle.cpp @@ -90,6 +90,7 @@ class shuffle_test_t : public ::testing::TestWithParam { protected: void SetUp() override { + SKIP_IF_CUDA(true, "Shuffle primitive not supported by CUDA"); data_type = data_traits::data_type; SKIP_IF(data_type == memory::data_type::f16 && get_test_engine_kind() == engine::kind::cpu, diff --git a/tests/gtests/test_softmax.cpp b/tests/gtests/test_softmax.cpp index cd9c3d53aba..c98b46c7197 100644 --- a/tests/gtests/test_softmax.cpp +++ b/tests/gtests/test_softmax.cpp @@ -45,9 +45,21 @@ class softmax_test_t protected: void SetUp() override { p = ::testing::TestWithParam>::GetParam(); + + SKIP_IF_CUDA(!cuda_check_format_tag(p.memory_format), + "Unsupported format tag"); + SKIP_IF_CUDA(!cuda_check_format_tag(p.diff_memory_format), + "Unsupported format tag"); + SKIP_IF_CUDA(data_traits::data_type == memory::data_type::bf16, + "Unsupported datatype for CUDA"); + catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); } + bool cuda_check_format_tag(memory::format_tag tag) { + return (tag != memory::format_tag::aBcd8b + && tag != memory::format_tag::aBc16b); + } void Forward() { // softmax specific types and values diff --git a/tests/gtests/test_sum.cpp b/tests/gtests/test_sum.cpp index fbddd4d62b5..76a9b885d5c 100644 --- a/tests/gtests/test_sum.cpp +++ b/tests/gtests/test_sum.cpp @@ -47,7 +47,7 @@ TEST_F(iface_sum_test_t, SumTestDstDataTypeCompliance) { for_(tag dst_tag : {tag::any, tag::abcd, tag::acdb}) for (dt dst_dt : {dt::undef, dt::s8, dt::s32, dt::f32}) { sum::primitive_desc sum_pd; - + SKIP_FOR_LOOP_CUDA(dst_dt == dt::s32, "Unsupported data_type"); if (dst_dt != dt::undef) { memory::desc dst_md(shape, dst_dt, dst_tag); sum_pd = sum::primitive_desc( @@ -132,6 +132,14 @@ class sum_test_t : public ::testing::TestWithParam { } protected: + bool cuda_supported_format_tag(memory::format_tag tag) { + return impl::utils::one_of(tag, dnnl_a, dnnl_ab, dnnl_abc, dnnl_abcd, + dnnl_abcde, dnnl_abcdef, dnnl_abdec, dnnl_acb, dnnl_acbde, + dnnl_acbdef, dnnl_acdb, dnnl_acdeb, dnnl_ba, dnnl_bac, + dnnl_bacd, dnnl_bca, dnnl_bcda, dnnl_bcdea, dnnl_cba, dnnl_cdba, + dnnl_cdeba, dnnl_decab, dnnl_defcab, dnnl_aBc4b, dnnl_aBcd4b, + dnnl_aBcde4b); + } void SetUp() override { src_data_type = data_traits::data_type; dst_data_type = data_traits::data_type; @@ -142,6 +150,15 @@ class sum_test_t : public ::testing::TestWithParam { "GPU does not support bfloat16 data type."); SKIP_IF(unsupported_data_type(src_data_type), "Engine does not support this data type."); + SKIP_IF(unsupported_data_type(dst_data_type), + "Engine does not support this data type."); + + SKIP_IF_CUDA(!cuda_supported_format_tag(p.dst_format), + "Unsupported format tag"); + for (int i = 0; i < p.srcs_format.size(); i++) { + SKIP_IF_CUDA(!cuda_supported_format_tag(p.srcs_format[i]), + "Unsupported format tag"); + } catch_expected_failures( [=]() { Test(); }, p.expect_to_fail, p.expected_status); }
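Note: the per-test guards added throughout this patch follow one pattern: whitelist the plain (non-blocked) layouts and parameter combinations the cuDNN backend handles, and skip the test otherwise. The sketch below illustrates that pattern only; the enum and the SKIP_IF_CUDA stand-in are simplified placeholders, not the real dnnl::memory::format_tag or the macro defined in tests/gtests/dnnl_test_macros.hpp (which additionally gates on the engine being an Nvidia GPU).

// Illustrative sketch only -- not part of the patch.
#include <cstdio>
#include <initializer_list>

enum class format_tag { nchw, nhwc, ncdhw, ndhwc, aBcd8b, aBcd16b };

// Mirrors the role of impl::utils::one_of() used by the tests above.
template <typename T>
static bool one_of(T val, std::initializer_list<T> list) {
    for (auto v : list)
        if (v == val) return true;
    return false;
}

// Same shape as cuda_check_format_tag() in test_eltwise.cpp / test_softmax.cpp:
// blocked layouts are rejected, plain layouts pass.
static bool cuda_check_format_tag(format_tag tag) {
    return !one_of(tag, {format_tag::aBcd8b, format_tag::aBcd16b});
}

// Stand-in for SKIP_IF_CUDA: the real macro skips only when the test engine is
// an Nvidia GPU; here it simply reports the skip and returns from the body.
#define SKIP_IF_CUDA(cond, msg) \
    do { \
        if (cond) { \
            std::printf("[ SKIPPED ] %s\n", msg); \
            return; \
        } \
    } while (0)

static void set_up(format_tag data_format) {
    SKIP_IF_CUDA(!cuda_check_format_tag(data_format), "Unsupported format tag");
    std::printf("running test body\n");
}

int main() {
    set_up(format_tag::nhwc); // runs
    set_up(format_tag::aBcd16b); // would be skipped on a CUDA engine
    return 0;
}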