[WIP] New arch #320

Merged 6 commits on Jun 8, 2022
Changes from all commits
19 changes: 19 additions & 0 deletions lightseq/csrc/ops/context.cpp
@@ -0,0 +1,19 @@
#include "context.h"

Context::Context() : _stream(nullptr) {
CHECK_GPU_ERROR(cublasCreate(&_cublasHandle));
}

Context &Context::Instance() {
static Context _ctx;
return _ctx;
}

void Context::set_stream(cudaStream_t stream) {
_stream = stream;
CHECK_GPU_ERROR(cublasSetStream(_cublasHandle, _stream));
}

cudaStream_t Context::get_stream() { return _stream; }

cublasHandle_t Context::get_cublashandle() { return _cublasHandle; }
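Note: a minimal usage sketch of the Context singleton above, assuming the extension binds the framework's CUDA stream at startup; the stream creation below is illustrative only and is not part of this PR.

#include <cuda_runtime.h>

#include "context.h"

// Illustrative only: in practice the stream would come from the framework
// (e.g. the active torch stream) rather than cudaStreamCreate.
void bind_context_stream() {
  cudaStream_t stream = nullptr;
  cudaStreamCreate(&stream);

  Context::Instance().set_stream(stream);  // also rebinds the cuBLAS handle
  cublasHandle_t handle = Context::Instance().get_cublashandle();
  (void)handle;  // handed to ops such as FeedForward<T>::Forward
}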
90 changes: 90 additions & 0 deletions lightseq/csrc/ops/dropout.cpp
@@ -0,0 +1,90 @@
#include "dropout.h"
#include <cuda.h>
#include <cuda_fp16.h>

template <typename T>
Dropout<T>::Dropout(const Dropout<T>::Config &config, size_t max_ele_num)
: _config(config), _mask(nullptr) {
_mask = cuda_malloc<uint8_t>(max_ele_num);
}

template <typename T>
Dropout<T>::~Dropout() {
cuda_free(_mask);
}

template <typename T>
void Dropout<T>::dropout(T *output, const T *input, int count,
cudaStream_t stream, bool bwd) {
launch_ls_dropout<T>(output, input, _mask, count, _config.RATIO(), stream,
bwd);
}

template <typename T>
void Dropout<T>::d_dropout(T *d_inp_out, int count, cudaStream_t stream) {
launch_ls_dropout<T>(d_inp_out, d_inp_out, _mask, count, _config.RATIO(),
stream, true);
}

template <typename T>
void Dropout<T>::bias_dropout_residual(T *output, const T *input,
const T *residual, const T *bias,
int rows, int cols,
cudaStream_t stream) {
launch_ls_dropout_res_bias<T>(output, input, _mask, bias, residual,
rows * cols, cols, _config.RATIO(), stream);
}

template <typename T>
void Dropout<T>::d_bias_dropout_residual(T *d_input, T *d_bias,
const T *d_output, int rows, int cols,
cudaStream_t stream) {
launch_ls_dropout_bias_bwd<T>(d_input, d_bias, d_output, _mask, rows, cols,
_config.RATIO(), stream);
}

template <typename T>
void Dropout<T>::bias_act_dropout(T *output, const T *input, const T *bias,
int rows, int cols, std::string activation_fn,
cudaStream_t stream) {
if (activation_fn == "relu") {
launch_ls_dropout_act_bias<ActivationType::kRelu, T>(
output, input, _mask, bias, rows * cols, cols, _config.RATIO(), stream);
} else if (activation_fn == "gelu") {
launch_ls_dropout_act_bias<ActivationType::kGelu, T>(
output, input, _mask, bias, rows * cols, cols, _config.RATIO(), stream);
} else {
throw std::runtime_error("not supported activation: " + activation_fn);
}
}

template <typename T>
void Dropout<T>::d_bias_act_dropout(T *d_inp_out, T *d_bias_out, const T *input,
const T *bias, int rows, int cols,
std::string activation_fn,
cudaStream_t stream) {
if (activation_fn == "relu") {
launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, T>(
d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
_config.RATIO(), stream);
} else if (activation_fn == "gelu") {
launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, T>(
d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
_config.RATIO(), stream);
} else {
throw std::runtime_error("not supported activation: " + activation_fn);
}
}

template <typename T>
bool Dropout<T>::HasDropout() const {
return _config.RATIO() > 0.0;
}

template <typename T>
void Dropout<T>::SetTrainingMode(bool training) {
_config.training = training;
}

template class Dropout<float>;
template class Dropout<__half>;
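Note: a rough usage sketch for the Dropout wrapper, assuming device-resident buffers; the pointer names, sizes, and the 0.1 ratio are hypothetical, not values taken from this PR.

#include "dropout.h"

// Hypothetical helper: all pointers are assumed to be device buffers.
void example_ffn_dropout(float *d_out, const float *d_in, const float *d_bias,
                         int rows, int cols, cudaStream_t stream) {
  // Sized once for the largest activation this instance will ever mask.
  Dropout<float> drop(Dropout<float>::Config(0.1f), (size_t)rows * cols);

  drop.SetTrainingMode(true);   // RATIO() returns 0.1 while training
  drop.bias_act_dropout(d_out, d_in, d_bias, rows, cols, "gelu", stream);

  drop.SetTrainingMode(false);  // RATIO() returns 0.0, i.e. dropout disabled
  // HasDropout() would now report false.
}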
41 changes: 41 additions & 0 deletions lightseq/csrc/ops/feed_forward.cpp
@@ -0,0 +1,41 @@
#include "feed_forward.h"

template <typename T>
void FeedForward<T>::Forward(int bsz, const T *input_ptr, const T *weights,
T *out, cublasHandle_t &_cublasHandle) {
float alpha = T(1.);
float beta = T(0.);

cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
bsz, config_.inputSize, &alpha, &beta, weights, input_ptr, out,
cublasGemmAlgo_t(config_.gemm_algos[0]));
}

template <typename T>
void FeedForward<T>::Backward(int bsz, const T *out_grad, const T *input_ptr,
const T *weights, T *weights_grad, T *bias_grad,
cublasHandle_t &_cublasHandle,
cudaStream_t &stream, T *inp_grad_out,
T *out_grad_trans_out, bool compute_bias) {
float alpha = (T)1.0, beta = (T)0.0;
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));

cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
if (compute_bias) {
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz,
config_.outputSize, stream);
}
}

template <typename T>
void FeedForward<T>::reset_size(int outputSize, int inputSize) {
config_.outputSize = outputSize;
config_.inputSize = inputSize;
}

template class FeedForward<float>;
template class FeedForward<__half>;
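Note: with row-major tensors, Forward above reads as a bias-free linear layer, out[bsz, outputSize] = input[bsz, inputSize] x weights[outputSize, inputSize]^T, dispatched through the cublas_gemm_ex wrapper. A hypothetical call site follows; the device buffers and the 768/3072 sizes are assumptions, not values from this PR.

#include "context.h"
#include "feed_forward.h"

// Hypothetical call site; d_in, d_w and d_out are assumed device buffers.
void run_linear(const float *d_in, const float *d_w, float *d_out, int bsz) {
  FeedForward<float>::Config cfg(/*outputs=*/3072, /*inputs=*/768);
  FeedForward<float> ff(cfg);

  cublasHandle_t handle = Context::Instance().get_cublashandle();
  ff.Forward(bsz, d_in, d_w, d_out, handle);  // d_out has shape [bsz, 3072]
}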
23 changes: 23 additions & 0 deletions lightseq/csrc/ops/includes/context.h
@@ -0,0 +1,23 @@
#pragma once

#include <cublas_v2.h>
#include <cuda.h>

#include <iostream>
#include <string>

#include "cuda_util.h"

class Context {
public:
Context();
virtual ~Context() {}
static Context &Instance();
void set_stream(cudaStream_t stream);
cudaStream_t get_stream();
cublasHandle_t get_cublashandle();

private:
cudaStream_t _stream;
cublasHandle_t _cublasHandle;
};
56 changes: 56 additions & 0 deletions lightseq/csrc/ops/includes/dropout.h
@@ -0,0 +1,56 @@
#pragma once

#include <string>
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "cuda_util.h"
#include "kernels.h"

template <typename T>
class Dropout {
public:
struct Config {
float ratio;
bool training;

Config(float r) : ratio(r), training(true) {}
float RATIO() const { return training ? ratio : 0.0; }
};

Dropout(const Config &config, size_t max_ele_num);

virtual ~Dropout();

// after attention softmax
void dropout(T *output, const T *input, int count, cudaStream_t stream,
bool bwd = false);

void d_dropout(T *d_inp_out, int count, cudaStream_t stream);

// transformer layer's postprocessing dropout, after attn or ffn module,
// before residual add.
void bias_dropout_residual(T *output, const T *input, const T *residual,
const T *bias, int rows, int cols,
cudaStream_t stream);

void d_bias_dropout_residual(T *d_input, T *d_bias, const T *d_output,
int rows, int cols, cudaStream_t stream);

// dropout inside ffn.
void bias_act_dropout(T *output, const T *input, const T *bias, int rows,
int cols, std::string activation_fn,
cudaStream_t stream);

void d_bias_act_dropout(T *d_inp_out, T *d_bias_out, const T *input,
const T *bias, int rows, int cols,
std::string activation_fn, cudaStream_t stream);

bool HasDropout() const;

void SetTrainingMode(bool training);

private:
uint8_t *_mask;
Config _config;
};
45 changes: 45 additions & 0 deletions lightseq/csrc/ops/includes/feed_forward.h
@@ -0,0 +1,45 @@
#pragma once

/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
*/
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>

#include <array>

#include "cublas_wrappers.h"
#include "kernels.h"

template <typename T>
class FeedForward {
public:
struct Config {
int outputSize;
int inputSize;
std::array<int, 3> gemm_algos;
Config(int outputs, int inputs)
: outputSize(outputs),
inputSize(inputs),
gemm_algos(std::array<int, 3>({99, 99, 99})) {}
};
FeedForward(Config config) : config_(config) {}

~FeedForward() {}

void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
cublasHandle_t &_cublasHandle);

void Backward(int bsz, const T *out_grad, const T *input_ptr,
const T *weights, T *weights_grad, T *bias_grad,
cublasHandle_t &_cublasHandle, cudaStream_t &stream,
T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
bool compute_bias = true);

void reset_size(int outputSize, int inputSize);

private:
Config config_;
};
lightseq/csrc/ops/includes/normalize_layer.h
@@ -5,7 +5,7 @@
 #include <stdio.h>
 
 #include <fstream>
-
+#include "cuda_util.h"
 #include "kernels.h"
 
 using namespace std;
@@ -20,24 +20,12 @@ class Normalize_Layer
       : hidden_dim(hidden_dim), use_mean(use_mean) {}
   };
 
-  Normalize_Layer(Config config, size_t max_rows)
-      : config_(config), vars_(nullptr), means_(nullptr) {
-    vars_ = cuda_malloc<T>(max_rows);
-    if (config_.use_mean) {
-      means_ = cuda_malloc<T>(max_rows);
-    }
-  }
+  Normalize_Layer(Config config, size_t max_rows);
 
-  ~Normalize_Layer() {
-    cuda_free(vars_);
-    cuda_free(means_);
-  }
+  ~Normalize_Layer();
 
   void Forward(T *ln_res, const T *inp, const T *gamma, const T *betta,
-               int batch_size, cudaStream_t stream) {
-    launch_layer_norm(ln_res, vars_, means_, inp, gamma, betta, batch_size,
-                      config_.hidden_dim, stream);
-  }
+               int batch_size, cudaStream_t stream);
 
   /*
   residual_grad, inp_or_out, betta should be treated carefully.
@@ -50,13 +38,9 @@
   */
   void Backward(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
                 const T *residual_grad, const T *inp_or_out, const T *gamma,
-                const T *betta, int batch_size, cudaStream_t stream[2]) {
-    launch_ln_bw(gamma_grad, betta_grad, inp_grad, out_grad, residual_grad,
-                 inp_or_out, gamma, betta, vars_, means_, batch_size,
-                 config_.hidden_dim, stream);
-  }
+                const T *betta, int batch_size, cudaStream_t stream[2]);
 
-  inline bool use_mean() const { return config_.use_mean; }
+  bool use_mean() const;
 
  private:
   Config config_;
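Note: the header now only declares these members, so their definitions presumably land in a companion lightseq/csrc/ops/normalize_layer.cpp that is not shown in this excerpt. Below is a sketch of what that file would contain, reconstructed from the inline bodies removed above and the explicit-instantiation pattern used by dropout.cpp and feed_forward.cpp; treat it as an assumption, not the PR's actual file.

#include <cuda_fp16.h>

#include "normalize_layer.h"

template <typename T>
Normalize_Layer<T>::Normalize_Layer(Config config, size_t max_rows)
    : config_(config), vars_(nullptr), means_(nullptr) {
  // Workspace for per-row variance (and mean, when use_mean is enabled).
  vars_ = cuda_malloc<T>(max_rows);
  if (config_.use_mean) {
    means_ = cuda_malloc<T>(max_rows);
  }
}

template <typename T>
Normalize_Layer<T>::~Normalize_Layer() {
  cuda_free(vars_);
  cuda_free(means_);
}

template <typename T>
void Normalize_Layer<T>::Forward(T *ln_res, const T *inp, const T *gamma,
                                 const T *betta, int batch_size,
                                 cudaStream_t stream) {
  launch_layer_norm(ln_res, vars_, means_, inp, gamma, betta, batch_size,
                    config_.hidden_dim, stream);
}

template <typename T>
void Normalize_Layer<T>::Backward(T *gamma_grad, T *betta_grad, T *inp_grad,
                                  const T *out_grad, const T *residual_grad,
                                  const T *inp_or_out, const T *gamma,
                                  const T *betta, int batch_size,
                                  cudaStream_t stream[2]) {
  launch_ln_bw(gamma_grad, betta_grad, inp_grad, out_grad, residual_grad,
               inp_or_out, gamma, betta, vars_, means_, batch_size,
               config_.hidden_dim, stream);
}

template <typename T>
bool Normalize_Layer<T>::use_mean() const {
  return config_.use_mean;
}

// Out-of-line definitions require explicit instantiation, as in the other ops.
template class Normalize_Layer<float>;
template class Normalize_Layer<__half>;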
lightseq/csrc/ops/includes/softmax.h
@@ -25,17 +25,10 @@ class Softmax
   ~Softmax() {}
 
   void Forward(T *vals, const T *attn_mask, int batch_size, int from_len,
-               int to_len, cudaStream_t &stream, bool mask_future = false) {
-    launch_attn_softmax<T>(vals, attn_mask, batch_size, config_.nhead, from_len,
-                           to_len, config_.mask_future | mask_future, stream);
-  }
+               int to_len, cudaStream_t &stream, bool mask_future = false);
 
   void Backward(T *out_grad, const T *soft_out, int batch_size, int from_len,
-                int to_len, cudaStream_t stream) {
-    launch_attn_softmax_bw<T>(out_grad, soft_out,
-                              batch_size * config_.nhead * from_len, to_len,
-                              stream);
-  }
+                int to_len, cudaStream_t stream);
 
  private:
   Config config_;
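Note: likewise, a companion softmax.cpp (also not shown in this excerpt) would presumably carry the bodies removed from this header, again ending with explicit instantiations. A sketch under that assumption:

#include <cuda_fp16.h>

#include "softmax.h"

template <typename T>
void Softmax<T>::Forward(T *vals, const T *attn_mask, int batch_size,
                         int from_len, int to_len, cudaStream_t &stream,
                         bool mask_future) {
  launch_attn_softmax<T>(vals, attn_mask, batch_size, config_.nhead, from_len,
                         to_len, config_.mask_future | mask_future, stream);
}

template <typename T>
void Softmax<T>::Backward(T *out_grad, const T *soft_out, int batch_size,
                          int from_len, int to_len, cudaStream_t stream) {
  launch_attn_softmax_bw<T>(out_grad, soft_out,
                            batch_size * config_.nhead * from_len, to_len,
                            stream);
}

template class Softmax<float>;
template class Softmax<__half>;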