diff --git a/README.md b/README.md
index e351df87..f5a5a20b 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ The following is a support matrix of LightSeq **inference** library compared wit
## Performance
### [>>> Training](./lightseq/training)
-Here we present the experimental results on WMT14 English to German translation task based on Transformer-big models. We train Transformer models of different sizes on eight NVIDIA Tesla V100/NVIDIA Ampere A100 GPUs with data parallel and fp16 mixed precision.
+Here we present the experimental results on the WMT14 English to German translation task based on Transformer-big models. We train Transformer models of different sizes on eight NVIDIA Tesla V100/NVIDIA A100 GPUs with data parallelism and fp16 mixed precision.
[Fairseq](https://github.com/pytorch/fairseq) with [Apex](https://github.com/NVIDIA/apex) is chosen as our baseline.
@@ -66,6 +66,20 @@ More results is available [here](./docs/inference/performance.md).
## Quick Start
Complete user guide is available [here](docs/guide.md).
+### Installation
+You can install LightSeq from PyPI:
+```shell
+$ pip install lightseq
+```
+
+LightSeq installation from PyPI only supports Python 3.6 to 3.8 on Linux for now. Consider compiling from source if you use other environments:
+```shell
+$ PATH=/usr/local/hdf5/:$PATH ENABLE_FP32=0 ENABLE_DEBUG=0 pip install -e $PROJECT_DIR
+```
+
+Detailed build instructions are available [here](docs/inference/build.md).
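+
+You can quickly verify the installation with an import check (a minimal sketch; it only confirms that the Python packages load, and the training package additionally assumes PyTorch is installed):
+```python
+# sanity check: both the training and inference packages should import cleanly
+import lightseq.training
+import lightseq.inference as lsi
+
+print(lsi.__name__)
+```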
+
+
### Fast training from Fairseq
You can experience lightning fast training by running the following commands,
@@ -97,12 +111,10 @@ $ cd examples/inference/python
then you can check the performance by simply running the following commands. `hf_bart_export.py` is used to transform PyTorch weights to the LightSeq protobuf format.
```shell
-$ python export/hf_bart_export.py
+$ python export/huggingface/hf_bart_export.py
$ python test/ls_bart.py
```
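+
+`test/ls_bart.py` loads the exported file (named `lightseq_bart_base` with a `.pb` or `.hdf5` extension by default) and checks the outputs and speed against the original Hugging Face model. Below is a minimal hedged sketch of the LightSeq side, assuming the exported BART file is loaded with `lsi.Transformer` as in the Fairseq examples (the file name and token ids are placeholders):
+```python
+import lightseq.inference as lsi
+
+# default output name of hf_bart_export.py; the extension depends on the export format
+model = lsi.Transformer("lightseq_bart_base.hdf5", 128)  # 128 = max batch size
+output = model.infer([[0, 100, 657, 14, 1816, 6, 2]])  # placeholder BART token ids
+print(output)
+```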
-LightSeq installation from pypi only supports python 3.6 to 3.8 on Linux for now. Consider compiling from source if you have other environments.
-
More usage is available [here](./lightseq/inference/README.md).
### Fast deploy inference server
diff --git a/docker/README.md b/docker/README.md
index f29df5c6..375f5f4e 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,5 +1,5 @@
## Dockerfiles of lightseq
-Pypi: for publish python package.
+PyPI: for publishing the Python package.
Tritonserver: for publishing tritonserver.
diff --git a/docs/guide.md b/docs/guide.md
index 1fd427c3..651cc616 100644
--- a/docs/guide.md
+++ b/docs/guide.md
@@ -119,7 +119,7 @@ These functions can export the configuration, embedding, encoder and decoder wei
LightSeq provides export examples of native Hugging Face BERT/BART/GPT2, Fairseq trained with LightSeq and LightSeq Transformer. All codes are available [here](../examples/inference/python/export).
#### Fairseq
-The main code is as follows (some parameters are omitted). Complete code is available [here](../examples/inference/python/export/ls_fs_transformer_export.py).
+The main code is as follows (some parameters are omitted). Complete code is available [here](../examples/inference/python/export/fairseq/ls_fs_transformer_export.py).
```python
model = Transformer()
encoder_state_dict, decoder_state_dict = _extract_weight(state_dict)
@@ -136,7 +136,7 @@ First, you need to divide the state dict into two parts of encoder and decoder,
The above functions export the checkpoints to protobuf by default. Specify `save_pb=False` to export to hdf5 files. You can use the [Fairseq training example](../examples/training/fairseq) to obtain the trained checkpoints.
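+
+For illustration, here is a minimal sketch of the two target objects that the `export_ls_*` helpers can write to (the file name is a placeholder):
+```python
+import h5py
+from export.proto.transformer_pb2 import Transformer
+
+save_pb = True
+if save_pb:
+    # protobuf target, later serialized with file.SerializeToString()
+    file = Transformer()
+else:
+    # hdf5 target, datasets are created directly inside this file
+    file = h5py.File("checkpoint_best.hdf5", "w")
+```
+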
#### Hugging Face
-LightSeq provides three examples of exporting native Hugging Face models ([BERT](../examples/inference/python/export/hf_bert_export.py), [BART](../examples/inference/python/export/hf_bart_export.py) and [GPT2](../examples/inference/python/export/hf_gpt2_export.py)). Because these native models did not use LightSeq modules to pretrain, the users must manually make the export rules.
+LightSeq provides three examples of exporting native Hugging Face models ([BERT](../examples/inference/python/export/huggingface/hf_bert_export.py), [BART](../examples/inference/python/export/huggingface/hf_bart_export.py) and [GPT2](../examples/inference/python/export/huggingface/hf_gpt2_export.py)). Because these native models were not pretrained with LightSeq modules, users must manually define the export rules.
#### LightSeq Transformer
LightSeq provides an example of exporting its own Transformer module, which is similar to the Fairseq model export. You can use the [custom training example](../examples/training/custom) to obtain the trained checkpoints. This export example can also compare the results and speeds of forward propagation in the training library with the inference library loading both protobuf and hdf5 files. The results show that the inference library is about 2x faster than the forward propagation of the training library.
diff --git a/docs/training/images/single_step.png b/docs/training/images/single_step.png
index aae28f40..ea79e34c 100644
Binary files a/docs/training/images/single_step.png and b/docs/training/images/single_step.png differ
diff --git a/examples/inference/cpp/CMakeLists.txt b/examples/inference/cpp/CMakeLists.txt
index 64cec769..dbf92330 100644
--- a/examples/inference/cpp/CMakeLists.txt
+++ b/examples/inference/cpp/CMakeLists.txt
@@ -3,11 +3,20 @@ cmake_minimum_required(VERSION 3.18)
add_executable(transformer_example transformer_example.cc)
target_link_libraries(transformer_example PUBLIC liblightseq)
+add_executable(quant_transformer_example quant_transformer_example.cc)
+target_link_libraries(quant_transformer_example PUBLIC liblightseq)
+
add_executable(bert_example bert_example.cc)
target_link_libraries(bert_example PUBLIC liblightseq)
+add_executable(quant_bert_example quant_bert_example.cc)
+target_link_libraries(quant_bert_example PUBLIC liblightseq)
+
add_executable(gpt_example gpt_example.cc)
target_link_libraries(gpt_example PUBLIC liblightseq)
+add_executable(quant_gpt_example quant_gpt_example.cc)
+target_link_libraries(quant_gpt_example PUBLIC liblightseq)
+
add_executable(transformer_decoder_example decoder_example.cc.cu)
target_link_libraries(transformer_decoder_example PUBLIC transformer_model)
diff --git a/examples/inference/cpp/bert_example.cc b/examples/inference/cpp/bert_example.cc
index cdec69a1..22c08bb7 100644
--- a/examples/inference/cpp/bert_example.cc
+++ b/examples/inference/cpp/bert_example.cc
@@ -8,15 +8,31 @@ Example of how to run Bert inference using our implementation.
int main(int argc, char* argv[]) {
std::string model_weights_path = argv[1];
+ std::vector<int> example_input = {2859, 2758, 2051, 2157,
+ 2005, 6629, 7566, 1012};
+ int eg_seq_len = example_input.size();
int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
"Bert", model_weights_path, max_batch_size);
- int batch_size = 1;
- int batch_seq_len = 8;
- std::vector<int> host_input = {101, 4931, 1010, 2129, 2024, 2017, 102, 0};
-
void* d_input;
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
diff --git a/examples/inference/cpp/gpt_example.cc b/examples/inference/cpp/gpt_example.cc
index c1defe1a..bc07d90e 100644
--- a/examples/inference/cpp/gpt_example.cc
+++ b/examples/inference/cpp/gpt_example.cc
@@ -8,15 +8,30 @@ Example of how to run gpt inference using our implementation.
int main(int argc, char* argv[]) {
std::string model_weights_path = argv[1];
+ std::vector<int> example_input = {40, 1842, 345, 11, 475, 345, 910, 326};
+ int eg_seq_len = example_input.size();
int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
"Gpt", model_weights_path, max_batch_size);
- int batch_size = 1;
- int batch_seq_len = 5;
- std::vector<int> host_input = {3666, 1438, 318, 402, 11571};
-
void* d_input;
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
@@ -58,7 +73,7 @@ int main(int argc, char* argv[]) {
}
std::cout << std::endl;
- lightseq::cuda::print_vec(d_output, "output", 5);
+ lightseq::cuda::print_vec(d_output, "output", 10);
}
return 0;
diff --git a/examples/inference/cpp/quant_bert_example.cc b/examples/inference/cpp/quant_bert_example.cc
new file mode 100644
index 00000000..54ff5c14
--- /dev/null
+++ b/examples/inference/cpp/quant_bert_example.cc
@@ -0,0 +1,81 @@
+#include "model_base.h"
+#include "util.h"
+
+/**
+@file
+Example of how to run QuantBert inference using our implementation.
+*/
+
+int main(int argc, char* argv[]) {
+ std::string model_weights_path = argv[1];
+ std::vector<int> example_input = {2859, 2758, 2051, 2157,
+ 2005, 6629, 7566, 1012};
+ int eg_seq_len = example_input.size();
+ int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
+
+ auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
+ "QuantBert", model_weights_path, max_batch_size);
+
+ void* d_input;
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
+ lightseq::cuda::CHECK_GPU_ERROR(cudaMemcpy(
+ d_input, host_input.data(), sizeof(int) * batch_size * batch_seq_len,
+ cudaMemcpyHostToDevice));
+
+ model->set_input_ptr(0, d_input);
+ model->set_input_shape(0, {batch_size, batch_seq_len});
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ void* d_output;
+ std::vector<int> shape = model->get_output_max_shape(i);
+ int total_size = 1;
+ for (int j = 0; j < shape.size(); j++) {
+ total_size *= shape[j];
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_output, total_size * sizeof(int)));
+ model->set_output_ptr(i, d_output);
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(cudaStreamSynchronize(0));
+ std::cout << "infer preprocessing finished" << std::endl;
+
+ /* ---step5. infer and log--- */
+ for (int i = 0; i < 10; i++) {
+ auto start = std::chrono::high_resolution_clock::now();
+ model->Infer();
+ lightseq::cuda::print_time_duration(start, "one infer time", 0);
+ }
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ const float* d_output;
+ d_output = static_cast<const float*>(model->get_output_ptr(i));
+ std::vector<int> shape = model->get_output_shape(i);
+ std::cout << "output shape: ";
+ for (int j = 0; j < shape.size(); j++) {
+ std::cout << shape[j] << " ";
+ }
+ std::cout << std::endl;
+
+ lightseq::cuda::print_vec(d_output, "output", 5);
+ }
+
+ return 0;
+}
diff --git a/examples/inference/cpp/quant_gpt_example.cc b/examples/inference/cpp/quant_gpt_example.cc
new file mode 100644
index 00000000..6a3dce42
--- /dev/null
+++ b/examples/inference/cpp/quant_gpt_example.cc
@@ -0,0 +1,80 @@
+#include "model_base.h"
+#include "gpt.h"
+
+/**
+@file
+Example of how to run gpt inference using our implementation.
+*/
+
+int main(int argc, char* argv[]) {
+ std::string model_weights_path = argv[1];
+ std::vector<int> example_input = {40, 1842, 345, 11, 475, 345, 910, 326};
+ int eg_seq_len = example_input.size();
+ int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
+
+ auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
+ "QuantGpt", model_weights_path, max_batch_size);
+
+ void* d_input;
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
+ lightseq::cuda::CHECK_GPU_ERROR(cudaMemcpy(
+ d_input, host_input.data(), sizeof(int) * batch_size * batch_seq_len,
+ cudaMemcpyHostToDevice));
+
+ model->set_input_ptr(0, d_input);
+ model->set_input_shape(0, {batch_size, batch_seq_len});
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ void* d_output;
+ std::vector<int> shape = model->get_output_max_shape(i);
+ int total_size = 1;
+ for (int j = 0; j < shape.size(); j++) {
+ total_size *= shape[j];
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_output, total_size * sizeof(int)));
+ model->set_output_ptr(i, d_output);
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(cudaStreamSynchronize(0));
+ std::cout << "infer preprocessing finished" << std::endl;
+
+ /* ---step5. infer and log--- */
+ for (int i = 0; i < 10; i++) {
+ auto start = std::chrono::high_resolution_clock::now();
+ model->Infer();
+ lightseq::cuda::print_time_duration(start, "one infer time", 0);
+ }
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ const int* d_output;
+ d_output = static_cast<const int*>(model->get_output_ptr(i));
+ std::vector<int> shape = model->get_output_shape(i);
+ std::cout << "output shape: ";
+ for (int j = 0; j < shape.size(); j++) {
+ std::cout << shape[j] << " ";
+ }
+ std::cout << std::endl;
+
+ lightseq::cuda::print_vec(d_output, "output", 10);
+ }
+
+ return 0;
+}
diff --git a/examples/inference/cpp/quant_transformer_example.cc b/examples/inference/cpp/quant_transformer_example.cc
new file mode 100644
index 00000000..4073b8a3
--- /dev/null
+++ b/examples/inference/cpp/quant_transformer_example.cc
@@ -0,0 +1,88 @@
+#include "model_base.h"
+#include "util.h"
+
+/**
+@file
+Example of how to run quantized transformer inference using our implementation.
+*/
+
+int main(int argc, char* argv[]) {
+ std::string model_weights_path = argv[1];
+
+ std::vector<int> example_input = {63, 47, 65, 1507, 88, 74,
+ 10, 2057, 362, 9, 284, 6};
+ int eg_seq_len = example_input.size();
+ int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
+
+ auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
+ "QuantTransformer", model_weights_path, max_batch_size);
+
+ void* d_input;
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
+ lightseq::cuda::CHECK_GPU_ERROR(cudaMemcpy(
+ d_input, host_input.data(), sizeof(int) * batch_size * batch_seq_len,
+ cudaMemcpyHostToDevice));
+
+ model->set_input_ptr(0, d_input);
+ model->set_input_shape(0, {batch_size, batch_seq_len});
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ void* d_output;
+ std::vector<int> shape = model->get_output_max_shape(i);
+ int total_size = 1;
+ for (int j = 0; j < shape.size(); j++) {
+ total_size *= shape[j];
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_output, total_size * sizeof(int)));
+ model->set_output_ptr(i, d_output);
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(cudaStreamSynchronize(0));
+ std::cout << "infer preprocessing finished" << std::endl;
+
+ /* ---step5. infer and log--- */
+ for (int i = 0; i < 20; i++) {
+ auto start = std::chrono::high_resolution_clock::now();
+ model->Infer();
+ lightseq::cuda::print_time_duration(start, "one infer time", 0);
+ }
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ const void* d_output;
+ d_output = static_cast<const void*>(model->get_output_ptr(i));
+ std::vector<int> shape = model->get_output_shape(i);
+ std::cout << "output shape: ";
+ for (int j = 0; j < shape.size(); j++) {
+ std::cout << shape[j] << " ";
+ }
+ std::cout << std::endl;
+
+ if (!i)
+ lightseq::cuda::print_vec((int*)d_output, "output", 15);
+ else
+ lightseq::cuda::print_vec((float*)d_output, "output", 5);
+ }
+
+ // const int* res = model.get_result_ptr();
+ // const float* res_score = model.get_score_ptr();
+ // lightseq::cuda::print_vec(res_score, "res score", 5);
+ return 0;
+}
diff --git a/examples/inference/cpp/transformer_example.cc b/examples/inference/cpp/transformer_example.cc
index 6998064a..68f2f101 100644
--- a/examples/inference/cpp/transformer_example.cc
+++ b/examples/inference/cpp/transformer_example.cc
@@ -8,16 +8,32 @@ Example of how to run transformer inference using our implementation.
int main(int argc, char* argv[]) {
std::string model_weights_path = argv[1];
+
+ std::vector<int> example_input = {63, 47, 65, 1507, 88, 74,
+ 10, 2057, 362, 9, 284, 6};
+ int eg_seq_len = example_input.size();
int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
"Transformer", model_weights_path, max_batch_size);
- int batch_size = 1;
- int batch_seq_len = 14;
- std::vector<int> host_input = {0, 100, 657, 14, 1816, 6, 53,
- 50264, 473, 45, 50264, 162, 4, 2};
-
void* d_input;
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
@@ -43,14 +59,14 @@ int main(int argc, char* argv[]) {
std::cout << "infer preprocessing finished" << std::endl;
/* ---step5. infer and log--- */
- for (int i = 0; i < 10; i++) {
+ for (int i = 0; i < 20; i++) {
auto start = std::chrono::high_resolution_clock::now();
model->Infer();
lightseq::cuda::print_time_duration(start, "one infer time", 0);
}
for (int i = 0; i < model->get_output_size(); i++) {
- const float* d_output;
+ const void* d_output;
d_output = static_cast<const void*>(model->get_output_ptr(i));
std::vector shape = model->get_output_shape(i);
std::cout << "output shape: ";
@@ -59,7 +75,10 @@ int main(int argc, char* argv[]) {
}
std::cout << std::endl;
- lightseq::cuda::print_vec(d_output, "output", 5);
+ if (!i)
+ lightseq::cuda::print_vec((int*)d_output, "output", 15);
+ else
+ lightseq::cuda::print_vec((float*)d_output, "output", 5);
}
// const int* res = model.get_result_ptr();
diff --git a/examples/inference/python/README.md b/examples/inference/python/README.md
index 43c56aed..da721458 100644
--- a/examples/inference/python/README.md
+++ b/examples/inference/python/README.md
@@ -1,129 +1,62 @@
-# Examples of exporting models for LightSeq inference
-
-## Switch to the current directory
-```shell
-cd examples/inference/python
-```
-
-## Export models
-### Hugging Face
-1. Hugging Face BART
-
-Export Hugging Face BART models to protobuf/hdf5 format.
-```shell
-python export/huggingface/hf_bart_export.py
-```
-2. Hugging Face BERT
-
-Export Hugging Face BERT models to hdf5 format.
-```shell
-python export/huggingface/hf_bert_export.py
-```
-3. Hugging Face GPT2
-
-Export Hugging Face GPT2 models to hdf5 format.
-```shell
-python export/huggingface/hf_gpt2_export.py
-```
-4. Hugging Face ViT
-
-Export Hugging Face ViT models to hdf5 format.
-```shell
-python export/huggingface/hf_vit_export.py
-```
-### Native Fairseq
-1. Native Fairseq Transformer
-
-Export native Fairseq Transformer models to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/native_fs_transformer_export.py -m checkpoint_best.pt
-```
-
-2. Native Fairseq Transformer using PTQ
-
-Export native Fairseq Transformer models using PTQ to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/native_fs_transformer_export.py -m checkpoint_best.pt
-```
-
-3. Native Fairseq MoE Transformer
-
-Export Fairseq MoE models to protobuf/hdf5 format.
-```shell
-python export/fairseq/fs_moe_export.py
-```
-
-### Fairseq Transformer + LightSeq
-1. Fairseq Transformer using LightSeq training library
-
-Export Fairseq Transformer models training with LightSeq to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/ls_fs_transformer_export.py -m checkpoint_best.pt
-```
-
-2. Fairseq Transformer using LightSeq training library with PTQ
-
-Export Fairseq Transformer models training with LightSeq to protobuf format, and then using PTQ to speedup inference. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/ls_fs_transformer_ptq_export.py -m checkpoint_best.pt
-```
-
-### LightSeq Transformer
-
-1. LightSeq Transformer
-
-Export LightSeq Transformer models to protobuf/hdf5 format. Refer to the `examples/training/custom` directory for more training details.
-```shell
-python export/ls_transformer_export.py
-```
-2. LightSeq Transformer using PTQ
-
-Export LightSeq fp16/fp32 Transformer models to int8 protobuf format, and then using PTQ to speedup inference. Refer to the `examples/training/custom` directory for more training details. Note that in this example, we do not need to finetune the models using fake-quantization.
-```shell
-python export/ls_transformer_ptq_export.py
-```
-
-### Fairseq Transformer + custom Torch layers
-1. Fairseq Transformer using custom Torch layers
-
-Export Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/ls_torch_fs_transformer_export.py -m checkpoint_best.pt
-```
-
-2. Fairseq Transformer using custom Torch layers and PTQ
-
-Export PTQ Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/ls_torch_fs_transformer_ptq_export.py -m checkpoint_best.pt
-```
-
-3. Quantized Fairseq Transformer using custom Torch layers
-
-Export quantized Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/ls_torch_fs_quant_transformer_export.py -m checkpoint_best.pt
-```
-
-## Inference using LightSeq
+# Model export and LightSeq inference
+This directory contains examples of exporting models (LightSeq, Fairseq-based, Hugging Face, etc.) to protobuf/hdf5 format and then running fast inference with LightSeq. For each model, we provide normal float model export, quantized model export (QAT, quantization aware training) and PTQ (post training quantization) model export.
+
+Before doing anything, you need to switch to the current directory:
+```shell
+$ cd examples/inference/python
+```
+
+## Model export
+We provide the following export examples. All Fairseq-based models are trained using the scripts in [examples/training/fairseq](../../../examples/training/fairseq). The first two LightSeq Transformer models are trained using the scripts in [examples/training/custom](../../../examples/training/custom).
+
+| Model | Type | Command | Resource | Description |
+|-------|------|---------|----------|-------------|
+| LightSeq Transformer | Float | python export/ls_transformer_export.py -m ckpt_ls_custom.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/ckpt_ls_custom.pt) | Export LightSeq Transformer models to protobuf format. |
+| LightSeq Transformer + PTQ | Int8 | python export/ls_transformer_ptq_export.py -m ckpt_ls_custom.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/ckpt_ls_custom.pt) | Export LightSeq Transformer models to int8 protobuf format using post training quantization. |
+| Hugging Face BART | Float | python export/huggingface/hf_bart_export.py | / | Export Hugging Face BART models to protobuf/hdf5 format. |
+| Hugging Face BERT | Float | python export/huggingface/hf_bert_export.py | / | Export Hugging Face BERT models to hdf5 format. |
+| Hugging Face + custom Torch layer BERT + QAT | Int8 | python export/huggingface/ls_torch_hf_quant_bert_export.py -m ckpt_ls_torch_hf_quant_bert_ner.bin | / | Export Hugging Face BERT models trained with custom Torch layers to hdf5 format. |
+| Hugging Face GPT2 | Float | python export/huggingface/hf_gpt2_export.py | / | Export Hugging Face GPT2 models to hdf5 format. |
+| Hugging Face + custom Torch layer GPT2 + QAT | Int8 | python export/huggingface/ls_torch_hf_quant_gpt2_export.py -m ckpt_ls_torch_hf_quant_gpt2_ner.bin | / | Export Hugging Face GPT2 models trained with custom Torch layers to hdf5 format. |
+| Hugging Face ViT | Float | python export/huggingface/hf_vit_export.py | / | Export Hugging Face ViT models to hdf5 format. |
+| Native Fairseq Transformer | Float | python export/fairseq/native_fs_transformer_export.py -m ckpt_native_fairseq_31.06.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_native_fairseq_31.06.pt) | Export native Fairseq Transformer models to protobuf/hdf5 format. |
+| Native Fairseq Transformer + PTQ | Int8 | python export/fairseq/native_fs_transformer_ptq_export.py -m ckpt_native_fairseq_31.06.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_native_fairseq_31.06.pt) | Export native Fairseq Transformer models to int8 protobuf format using post training quantization. |
+| Fairseq + LightSeq Transformer | Float | python export/fairseq/ls_fs_transformer_export.py -m ckpt_ls_fairseq_31.17.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_ls_fairseq_31.17.pt) | Export Fairseq Transformer models trained with LightSeq modules to protobuf/hdf5 format. |
+| Fairseq + LightSeq Transformer + PTQ | Int8 | python export/fairseq/ls_fs_transformer_ptq_export.py -m ckpt_ls_fairseq_31.17.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_ls_fairseq_31.17.pt) | Export Fairseq Transformer models trained with LightSeq modules to int8 protobuf format using post training quantization. |
+| Fairseq + custom Torch layer | Float | python export/fairseq/ls_torch_fs_transformer_export.py -m ckpt_ls_torch_fairseq_31.16.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_ls_torch_fairseq_31.16.pt) | Export Fairseq Transformer models trained with custom Torch layers and other LightSeq modules to protobuf format. |
+| Fairseq + custom Torch layer + PTQ | Int8 | python export/fairseq/ls_torch_fs_transformer_ptq_export.py -m ckpt_ls_torch_fairseq_31.16.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_ls_torch_fairseq_31.16.pt) | Export Fairseq Transformer models trained with custom Torch layers and other LightSeq modules to int8 protobuf format using post training quantization. |
+| Fairseq + custom Torch layer + QAT | Int8 | python export/fairseq/ls_torch_fs_quant_transformer_export.py -m ckpt_ls_torch_fairseq_quant_31.09.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_ls_torch_fairseq_quant_31.09.pt) | Export quantized Fairseq Transformer models trained with custom Torch layers and other LightSeq modules to int8 protobuf format. |
+| Native Fairseq MoE Transformer | Float | python export/fairseq/native_fs_moe_transformer_export.py | / | Export Fairseq MoE Transformer models to protobuf/hdf5 format. |
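+
+After exporting, the resulting protobuf/hdf5 file can be loaded directly with `lightseq.inference`. A minimal sketch (the model path is a placeholder for whatever file the export script writes; int8 models use `QuantTransformer` instead):
+```python
+import lightseq.inference as lsi
+
+# float model exported to protobuf; 8 is the max batch size
+model = lsi.Transformer("checkpoint_best.pb", 8)
+# quant_model = lsi.QuantTransformer("checkpoint_best_ptq.pb", 8)  # int8 model
+src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
+print(model.infer(src))
+```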
+
+## LightSeq inference
+### Hugging Face models
1. BART
```shell
-python test/ls_bart.py
+$ python test/ls_bart.py
```
2. BERT
```shell
-python test/ls_bert.py
+$ python test/ls_bert.py
```
3. GPT2
```shell
-python test/ls_gpt2.py
+$ python test/ls_gpt2.py
```
4. ViT
```shell
-python test/ls_vit.py
+$ python test/ls_vit.py
+```
+5. Quantized BERT
+```shell
+$ python test/ls_quant_bert.py
+```
+6. Quantized GPT2
+```shell
+$ python test/ls_quant_gpt.py
```
-5. Fairseq based models using LightSeq inference
+### Fairseq-based models
+After exporting the Fairseq-based models to protobuf/hdf5 format using the above scripts, we can run fast LightSeq inference on the WMT14 En2De dataset with the following script, which is compatible with both fp16 and int8 models:
```shell
-bash test/ls_fairseq.sh --model ${model_path}
+$ bash test/ls_fairseq.sh --model ${model_path}
```
diff --git a/examples/inference/python/export/fairseq/ls_fs_transformer_export.py b/examples/inference/python/export/fairseq/ls_fs_transformer_export.py
index 5993f79a..1b86e7d8 100644
--- a/examples/inference/python/export/fairseq/ls_fs_transformer_export.py
+++ b/examples/inference/python/export/fairseq/ls_fs_transformer_export.py
@@ -1,10 +1,8 @@
"""
-Export Fairseq Transformer models training with LightSeq to protobuf/hdf5 format.
+Export Fairseq Transformer models trained with LightSeq modules to protobuf/hdf5 format.
Refer to the `examples/training/fairseq` directory for more training details.
"""
-import argparse
import torch
-import h5py
from export.proto.transformer_pb2 import Transformer
from lightseq.training import (
export_ls_config,
@@ -13,6 +11,7 @@
export_ls_decoder,
)
import lightseq.inference as lsi
+from export.util import parse_args, save_model
def _extract_weight(state_dict):
@@ -26,7 +25,7 @@ def _extract_weight(state_dict):
return encoder_state_dict, decoder_state_dict
-def export_fs_weights(file, state_dict, save_pb=True):
+def export_fs_weights(transformer, state_dict):
enc_norm_w = state_dict["encoder.layer_norm.weight"].flatten().tolist()
enc_norm_b = state_dict["encoder.layer_norm.bias"].flatten().tolist()
dec_norm_w = state_dict["decoder.layer_norm.weight"].flatten().tolist()
@@ -36,78 +35,52 @@ def export_fs_weights(file, state_dict, save_pb=True):
.flatten()
.tolist()
)
- if save_pb:
- file.src_embedding.norm_scale[:] = enc_norm_w
- file.src_embedding.norm_bias[:] = enc_norm_b
- file.trg_embedding.norm_scale[:] = dec_norm_w
- file.trg_embedding.norm_bias[:] = dec_norm_b
- file.trg_embedding.shared_bias[:] = dec_shared_b
- else:
- file.create_dataset("src_embedding/norm_scale", data=enc_norm_w, dtype="f4")
- file.create_dataset("src_embedding/norm_bias", data=enc_norm_b, dtype="f4")
- file.create_dataset("trg_embedding/norm_scale", data=dec_norm_w, dtype="f4")
- file.create_dataset("trg_embedding/norm_bias", data=dec_norm_b, dtype="f4")
- file.create_dataset("trg_embedding/shared_bias", data=dec_shared_b, dtype="f4")
+ transformer.src_embedding.norm_scale[:] = enc_norm_w
+ transformer.src_embedding.norm_bias[:] = enc_norm_b
+ transformer.trg_embedding.norm_scale[:] = dec_norm_w
+ transformer.trg_embedding.norm_bias[:] = dec_norm_b
+ transformer.trg_embedding.shared_bias[:] = dec_shared_b
-def export_ls_fs_transformer(ckpt_path, out_path, save_pb=True):
- with open(ckpt_path, "rb") as fin:
+def export_ls_fs_transformer(model_path, pb_path, hdf5_path, hdf5):
+ with open(model_path, "rb") as fin:
ckpt_file = torch.load(fin)
args = ckpt_file["args"]
state_dict = ckpt_file["model"]
- if save_pb:
- file = Transformer()
- else:
- file = h5py.File(out_path, "w")
+ transformer = Transformer()
encoder_state_dict, decoder_state_dict = _extract_weight(state_dict)
- export_ls_embedding(file, encoder_state_dict, 300, True, save_pb)
- export_ls_embedding(file, decoder_state_dict, 300, False, save_pb)
+ export_ls_embedding(transformer, encoder_state_dict, 300, True, save_pb=True)
+ export_ls_embedding(transformer, decoder_state_dict, 300, False, save_pb=True)
export_ls_encoder(
- file,
+ transformer,
encoder_state_dict,
args.encoder_embed_dim,
args.encoder_ffn_embed_dim,
- save_pb,
+ save_pb=True,
)
export_ls_decoder(
- file,
+ transformer,
decoder_state_dict,
args.decoder_embed_dim,
args.decoder_ffn_embed_dim,
args.decoder_layers,
- save_pb,
+ save_pb=True,
)
- export_fs_weights(file, state_dict, save_pb)
+ export_fs_weights(transformer, state_dict)
export_ls_config(
- file,
+ transformer,
args.encoder_attention_heads,
1,
2,
2,
args.encoder_layers,
args.decoder_layers,
- save_pb=save_pb,
+ save_pb=True,
)
- if save_pb:
- with open(out_path, "wb") as fout:
- fout.write(file.SerializeToString())
- else:
- file.close()
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
@@ -115,15 +88,9 @@ def parse_args():
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}.pb"
hdf5_path = f"{model_name}.hdf5"
- print("export to pb model >>>>>>")
- export_ls_fs_transformer(args.model, pb_path)
- print("export to hdf5 model >>>>>>")
- export_ls_fs_transformer(args.model, hdf5_path, save_pb=False)
+ path = export_ls_fs_transformer(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.Transformer(pb_path, 8)
- pb_output = pb_model.infer(src)
- hdf5_model = lsi.Transformer(hdf5_path, 8)
- hdf5_output = hdf5_model.infer(src)
+ model = lsi.Transformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
- print("hdf5 results:", hdf5_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/ls_fs_transformer_ptq_export.py b/examples/inference/python/export/fairseq/ls_fs_transformer_ptq_export.py
index 2aeeba23..ae093990 100644
--- a/examples/inference/python/export/fairseq/ls_fs_transformer_ptq_export.py
+++ b/examples/inference/python/export/fairseq/ls_fs_transformer_ptq_export.py
@@ -1,11 +1,9 @@
"""
-Export Fairseq Transformer models training with LightSeq to protobuf format,
-and then using int8 quantization to speedup inference.
+Export Fairseq Transformer models trained with LightSeq modules
+to int8 protobuf format using post training quantization.
Refer to the `examples/training/fairseq` directory for more training details.
"""
-import argparse
import torch
-import h5py
from export.proto.quant_transformer_pb2 import QuantTransformer
from lightseq.training import (
export_ls_config,
@@ -14,6 +12,7 @@
export_ls_decoder_ptq,
)
import lightseq.inference as lsi
+from export.util import parse_args, save_model
# adjust this value to achieve better performance
@@ -31,7 +30,7 @@ def _extract_weight(state_dict):
return encoder_state_dict, decoder_state_dict
-def export_fs_weights(file, state_dict, save_pb=True):
+def export_fs_weights(transformer, state_dict):
enc_norm_w = state_dict["encoder.layer_norm.weight"].flatten().tolist()
enc_norm_b = state_dict["encoder.layer_norm.bias"].flatten().tolist()
dec_norm_w = state_dict["decoder.layer_norm.weight"].flatten().tolist()
@@ -41,89 +40,76 @@ def export_fs_weights(file, state_dict, save_pb=True):
.flatten()
.tolist()
)
- file.src_embedding.norm_scale[:] = enc_norm_w
- file.src_embedding.norm_bias[:] = enc_norm_b
- file.trg_embedding.norm_scale[:] = dec_norm_w
- file.trg_embedding.norm_bias[:] = dec_norm_b
- file.trg_embedding.shared_bias[:] = dec_shared_b
+ transformer.src_embedding.norm_scale[:] = enc_norm_w
+ transformer.src_embedding.norm_bias[:] = enc_norm_b
+ transformer.trg_embedding.norm_scale[:] = dec_norm_w
+ transformer.trg_embedding.norm_bias[:] = dec_norm_b
+ transformer.trg_embedding.shared_bias[:] = dec_shared_b
-def export_ls_fs_transformer_ptq(ckpt_path, out_path, save_pb=True):
- with open(ckpt_path, "rb") as fin:
+def export_ls_fs_transformer_ptq(model_path, pb_path, hdf5_path, hdf5):
+ with open(model_path, "rb") as fin:
ckpt_file = torch.load(fin)
args = ckpt_file["args"]
state_dict = ckpt_file["model"]
- file = QuantTransformer()
+ transformer = QuantTransformer()
encoder_state_dict, decoder_state_dict = _extract_weight(state_dict)
export_ls_embedding_ptq(
- file,
+ transformer,
encoder_state_dict,
300,
True,
- save_pb=save_pb,
+ save_pb=True,
)
export_ls_embedding_ptq(
- file,
+ transformer,
decoder_state_dict,
300,
False,
- save_pb=save_pb,
+ save_pb=True,
)
export_ls_encoder_ptq(
- file,
+ transformer,
encoder_state_dict,
args.encoder_embed_dim,
args.encoder_ffn_embed_dim,
act_clip_max=global_act_clip_max,
- save_pb=save_pb,
+ save_pb=True,
)
export_ls_decoder_ptq(
- file,
+ transformer,
decoder_state_dict,
args.decoder_embed_dim,
args.decoder_ffn_embed_dim,
args.decoder_layers,
act_clip_max=global_act_clip_max,
- save_pb=save_pb,
+ save_pb=True,
)
- export_fs_weights(file, state_dict, save_pb)
+ export_fs_weights(transformer, state_dict)
export_ls_config(
- file,
+ transformer,
args.encoder_attention_heads,
1,
2,
2,
args.encoder_layers,
args.decoder_layers,
- save_pb=save_pb,
+ save_pb=True,
)
- with open(out_path, "wb") as fout:
- fout.write(file.SerializeToString())
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
args = parse_args()
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}_ptq.pb"
- print("export to pb model >>>>>>")
- export_ls_fs_transformer_ptq(args.model, pb_path)
+ hdf5_path = f"{model_name}_ptq.hdf5"
+ path = export_ls_fs_transformer_ptq(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.QuantTransformer(pb_path, 8)
- pb_output = pb_model.infer(src)
- # FP16 result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ model = lsi.QuantTransformer(path, 8)
+ output = model.infer(src)
+ # Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/ls_torch_fs_quant_transformer_export.py b/examples/inference/python/export/fairseq/ls_torch_fs_quant_transformer_export.py
index 6a05cecb..f10abfcb 100644
--- a/examples/inference/python/export/fairseq/ls_torch_fs_quant_transformer_export.py
+++ b/examples/inference/python/export/fairseq/ls_torch_fs_quant_transformer_export.py
@@ -1,20 +1,20 @@
"""
-Export quantized Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format.
+Export quantized Fairseq Transformer models trained with custom Torch layers
+and other LightSeq modules to int8 protobuf format.
Refer to the `examples/training/fairseq` directory for more training details.
"""
from collections import OrderedDict
-import argparse
import torch
-import tensorflow as tf
from export.proto.quant_transformer_pb2 import QuantTransformer
from lightseq.training.ops.pytorch.export import export_ls_config, apply_rule
-from lightseq.training.ops.pytorch.export_ptq import (
+from lightseq.training.ops.pytorch.export_quant import (
gather_quant_token_embedding,
quantize,
)
from lightseq.training.ops.pytorch.util import get_pos_embedding
import lightseq.inference as lsi
+from export.util import parse_args, save_model
enc_layer_mapping_dict = OrderedDict(
@@ -147,8 +147,10 @@ def fill_quant_pb_layer(tensor_names, state_dict, layer, mapping_dict):
def export_ls_torch_fs_quant_transformer(
- model_dir,
+ model_path,
pb_path,
+ hdf5_path,
+ hdf5,
max_step=300,
bos_id=2,
eos_id=2,
@@ -156,7 +158,7 @@ def export_ls_torch_fs_quant_transformer(
):
transformer = QuantTransformer()
# load var names
- reloaded = torch.load(model_dir, "cpu")
+ reloaded = torch.load(model_path, "cpu")
args = reloaded["args"]
model_dict = reloaded["model"]
@@ -304,31 +306,20 @@ def export_ls_torch_fs_quant_transformer(
save_pb=True,
)
- print("Writing to {0}".format(pb_path))
- with tf.io.gfile.GFile(pb_path, "wb") as fout:
- fout.write(transformer.SerializeToString())
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
args = parse_args()
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}.pb"
- export_ls_torch_fs_quant_transformer(args.model, pb_path)
+ hdf5_path = f"{model_name}.hdf5"
+ path = export_ls_torch_fs_quant_transformer(
+ args.model, pb_path, hdf5_path, args.hdf5
+ )
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.QuantTransformer(pb_path, 8)
- pb_output = pb_model.infer(src)
+ model = lsi.QuantTransformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/ls_torch_fs_transformer_export.py b/examples/inference/python/export/fairseq/ls_torch_fs_transformer_export.py
index 4f9d8267..ea223d53 100644
--- a/examples/inference/python/export/fairseq/ls_torch_fs_transformer_export.py
+++ b/examples/inference/python/export/fairseq/ls_torch_fs_transformer_export.py
@@ -1,12 +1,11 @@
"""
-Export Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format.
+Export Fairseq Transformer models trained with custom Torch layers
+and other LightSeq modules to protobuf format.
Refer to the `examples/training/fairseq` directory for more training details.
"""
from collections import OrderedDict
-import argparse
import torch
-import tensorflow as tf
from export.proto.transformer_pb2 import Transformer
from lightseq.training.ops.pytorch.export import (
gather_token_embedding,
@@ -15,6 +14,7 @@
)
from lightseq.training.ops.pytorch.util import get_pos_embedding
import lightseq.inference as lsi
+from export.util import parse_args, save_model
enc_layer_mapping_dict = OrderedDict(
@@ -91,8 +91,10 @@ def _get_encode_output_mapping_dict(dec_layer_num):
def export_ls_torch_fs_transformer(
- model_dir,
+ model_path,
pb_path,
+ hdf5_path,
+ hdf5,
max_step=300,
bos_id=2,
eos_id=2,
@@ -100,7 +102,7 @@ def export_ls_torch_fs_transformer(
):
transformer = Transformer()
# load var names
- reloaded = torch.load(model_dir, "cpu")
+ reloaded = torch.load(model_path, "cpu")
args = reloaded["args"]
model_dict = reloaded["model"]
@@ -229,31 +231,18 @@ def export_ls_torch_fs_transformer(
save_pb=True,
)
- print("Writing to {0}".format(pb_path))
- with tf.io.gfile.GFile(pb_path, "wb") as fout:
- fout.write(transformer.SerializeToString())
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
args = parse_args()
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}.pb"
- export_ls_torch_fs_transformer(args.model, pb_path)
+ hdf5_path = f"{model_name}.hdf5"
+ path = export_ls_torch_fs_transformer(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.Transformer(pb_path, 8)
- pb_output = pb_model.infer(src)
+ model = lsi.Transformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/ls_torch_fs_transformer_ptq_export.py b/examples/inference/python/export/fairseq/ls_torch_fs_transformer_ptq_export.py
index c6498893..2ab259e9 100644
--- a/examples/inference/python/export/fairseq/ls_torch_fs_transformer_ptq_export.py
+++ b/examples/inference/python/export/fairseq/ls_torch_fs_transformer_ptq_export.py
@@ -1,20 +1,20 @@
"""
-Export PTQ Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format.
+Export Fairseq Transformer models trained with custom Torch layers
+and other LightSeq modules to int8 protobuf format using post training quantization.
Refer to the `examples/training/fairseq` directory for more training details.
"""
from collections import OrderedDict
-import argparse
import torch
-import tensorflow as tf
from export.proto.quant_transformer_pb2 import QuantTransformer
from lightseq.training.ops.pytorch.export import export_ls_config
-from lightseq.training.ops.pytorch.export_ptq import (
+from lightseq.training.ops.pytorch.export_quant import (
gather_quant_token_embedding,
fill_quant_pb_layer,
)
from lightseq.training.ops.pytorch.util import get_pos_embedding
import lightseq.inference as lsi
+from export.util import parse_args, save_model
# adjust this value to achieve better performance
@@ -117,8 +117,10 @@ def _get_encode_output_mapping_dict(dec_layer_num):
def export_ls_torch_fs_transformer_ptq(
- model_dir,
+ model_path,
pb_path,
+ hdf5_path,
+ hdf5,
max_step=300,
bos_id=2,
eos_id=2,
@@ -126,7 +128,7 @@ def export_ls_torch_fs_transformer_ptq(
):
transformer = QuantTransformer()
# load var names
- reloaded = torch.load(model_dir, "cpu")
+ reloaded = torch.load(model_path, "cpu")
args = reloaded["args"]
model_dict = reloaded["model"]
@@ -266,31 +268,18 @@ def export_ls_torch_fs_transformer_ptq(
save_pb=True,
)
- print("Writing to {0}".format(pb_path))
- with tf.io.gfile.GFile(pb_path, "wb") as fout:
- fout.write(transformer.SerializeToString())
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
args = parse_args()
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}_ptq.pb"
- export_ls_torch_fs_transformer_ptq(args.model, pb_path)
+ hdf5_path = f"{model_name}_ptq.hdf5"
+ path = export_ls_torch_fs_transformer_ptq(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.QuantTransformer(pb_path, 8)
- pb_output = pb_model.infer(src)
+ model = lsi.QuantTransformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/native_fs_transformer_export.py b/examples/inference/python/export/fairseq/native_fs_transformer_export.py
index 0b77fd19..49e8aab8 100644
--- a/examples/inference/python/export/fairseq/native_fs_transformer_export.py
+++ b/examples/inference/python/export/fairseq/native_fs_transformer_export.py
@@ -3,20 +3,17 @@
Refer to the `examples/training/fairseq` directory for more training details.
"""
from collections import OrderedDict
-import argparse
import torch
-import tensorflow as tf
-import h5py
from export.proto.transformer_pb2 import Transformer
from lightseq.training.ops.pytorch.export import (
gather_token_embedding,
fill_pb_layer,
export_ls_config,
- export_pb2hdf5,
)
from lightseq.training.ops.pytorch.util import get_pos_embedding
import lightseq.inference as lsi
+from export.util import parse_args, save_model
enc_layer_mapping_dict = OrderedDict(
@@ -93,9 +90,10 @@ def _get_encode_output_mapping_dict(dec_layer_num):
def export_native_fs_transformer(
- model_dir,
+ model_path,
pb_path,
hdf5_path,
+ hdf5,
max_step=300,
bos_id=2,
eos_id=2,
@@ -103,7 +101,7 @@ def export_native_fs_transformer(
):
transformer = Transformer()
# load var names
- reloaded = torch.load(model_dir, "cpu")
+ reloaded = torch.load(model_path, "cpu")
args = reloaded["args"]
model_dict = reloaded["model"]
@@ -234,27 +232,8 @@ def export_native_fs_transformer(
save_pb=True,
)
- print("Writing to {0}".format(pb_path))
- with tf.io.gfile.GFile(pb_path, "wb") as fout:
- fout.write(transformer.SerializeToString())
-
- print("Writing to {0}".format(hdf5_path))
- f = h5py.File(hdf5_path, "w")
- export_pb2hdf5(transformer, f)
- f.close()
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
@@ -262,9 +241,9 @@ def parse_args():
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}.pb"
hdf5_path = f"{model_name}.hdf5"
- export_native_fs_transformer(args.model, pb_path, hdf5_path)
+ path = export_native_fs_transformer(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.Transformer(pb_path, 8)
- pb_output = pb_model.infer(src)
+ model = lsi.Transformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/native_fs_transformer_ptq_export.py b/examples/inference/python/export/fairseq/native_fs_transformer_ptq_export.py
index 446605f9..af704b6f 100644
--- a/examples/inference/python/export/fairseq/native_fs_transformer_ptq_export.py
+++ b/examples/inference/python/export/fairseq/native_fs_transformer_ptq_export.py
@@ -1,20 +1,19 @@
"""
-Export PTQ native Fairseq Transformer models to protobuf/hdf5 format.
+Export native Fairseq Transformer models to int8 protobuf format using post training quantization.
Refer to the `examples/training/fairseq` directory for more training details.
"""
from collections import OrderedDict
-import argparse
import torch
-import tensorflow as tf
from export.proto.quant_transformer_pb2 import QuantTransformer
from lightseq.training.ops.pytorch.export import export_ls_config
-from lightseq.training.ops.pytorch.export_ptq import (
+from lightseq.training.ops.pytorch.export_quant import (
gather_quant_token_embedding,
fill_quant_pb_layer,
)
from lightseq.training.ops.pytorch.util import get_pos_embedding
import lightseq.inference as lsi
+from export.util import parse_args, save_model
# adjust this value to achieve better performance
@@ -118,8 +117,10 @@ def _get_encode_output_mapping_dict(dec_layer_num):
def export_native_fs_transformer(
- model_dir,
+ model_path,
pb_path,
+ hdf5_path,
+ hdf5,
max_step=300,
bos_id=2,
eos_id=2,
@@ -127,7 +128,7 @@ def export_native_fs_transformer(
):
transformer = QuantTransformer()
# load var names
- reloaded = torch.load(model_dir, "cpu")
+ reloaded = torch.load(model_path, "cpu")
args = reloaded["args"]
model_dict = reloaded["model"]
@@ -267,31 +268,19 @@ def export_native_fs_transformer(
save_pb=True,
)
- print("Writing to {0}".format(pb_path))
- with tf.io.gfile.GFile(pb_path, "wb") as fout:
- fout.write(transformer.SerializeToString())
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
args = parse_args()
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}_ptq.pb"
- export_native_fs_transformer(args.model, pb_path)
+ hdf5_path = f"{model_name}_ptq.hdf5"
+ path = export_native_fs_transformer(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.QuantTransformer(pb_path, 8)
- pb_output = pb_model.infer(src)
+ model = lsi.QuantTransformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/huggingface/hf_bart_export.py b/examples/inference/python/export/huggingface/hf_bart_export.py
index 82a0effd..d4f6e519 100644
--- a/examples/inference/python/export/huggingface/hf_bart_export.py
+++ b/examples/inference/python/export/huggingface/hf_bart_export.py
@@ -11,6 +11,7 @@
from lightseq.training.ops.pytorch.export import gather_token_embedding, fill_pb_layer
from export.proto.transformer_pb2 import Transformer
from transformers import BartForConditionalGeneration
+from export.util import parse_args
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
@@ -512,14 +513,15 @@ def _print_pair(key, value):
if __name__ == "__main__":
+ args = parse_args()
+ if args.generation_method not in ["beam_search", "topk", "topp", "topk_greedy"]:
+ args.generation_method = "beam_search"
# if save_proto is True, extension .pb will be added, otherwise .hdf5 is added
output_lightseq_model_name = "lightseq_bart_base" # you can rename it to "lightseq_bart_large" for large model
input_huggingface_bart_model = (
"facebook/bart-base" # Example: you can try "facebook/bart-large" as well
)
head_number = 12 # change this to 16 for "bart-large" model
- # in order to get score, we should use `beam_search` inference method
- generation_method = "beam_search"
beam_size = 4
max_step = 50 # max step for generation, it decides GPU memory occupancy
# maximum_generation_length = min(src_length + extra_decode_length, max_step)
@@ -529,7 +531,7 @@ def _print_pair(key, value):
output_lightseq_model_name,
input_huggingface_bart_model,
head_num=head_number, # layer number
- generation_method=generation_method,
+ generation_method=args.generation_method,
beam_size=beam_size,
max_step=max_step,
extra_decode_length=extra_decode_length,
diff --git a/examples/inference/python/export/huggingface/hf_gpt2_export.py b/examples/inference/python/export/huggingface/hf_gpt2_export.py
index 89a12b42..aa559a10 100644
--- a/examples/inference/python/export/huggingface/hf_gpt2_export.py
+++ b/examples/inference/python/export/huggingface/hf_gpt2_export.py
@@ -7,6 +7,7 @@
from collections import OrderedDict
from transformers import GPT2LMHeadModel
from lightseq.training.ops.pytorch.export import fill_hdf5_layer
+from export.util import parse_args
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
@@ -146,11 +147,12 @@ def _print_pair(key, value):
if __name__ == "__main__":
+ args = parse_args()
+ if args.generation_method not in ["topk", "topp", "ppl"]:
+ args.generation_method = "topk"
output_lightseq_model_name = "lightseq_gpt2_base" # or "lightseq_gpt2_large"
input_huggingface_gpt_model = "gpt2" # or "gpt2-large"
head_number = 12 # 20 for "gpt2-large"
- # generation_method should be "topk" or "topp"
- generation_method = "topk"
topk = 1
topp = 0.75
# default eos_id from https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel
@@ -161,7 +163,7 @@ def _print_pair(key, value):
output_lightseq_model_name,
input_huggingface_gpt_model,
head_num=head_number, # layer number
- generation_method=generation_method,
+ generation_method=args.generation_method,
topk=topk,
topp=topp,
eos_id=eos_id,
diff --git a/examples/inference/python/export/huggingface/ls_torch_hf_quant_bert_export.py b/examples/inference/python/export/huggingface/ls_torch_hf_quant_bert_export.py
new file mode 100644
index 00000000..72f18e90
--- /dev/null
+++ b/examples/inference/python/export/huggingface/ls_torch_hf_quant_bert_export.py
@@ -0,0 +1,208 @@
+"""
+Export Hugging Face quantized BERT models to hdf5 format.
+"""
+import os
+import h5py
+from collections import OrderedDict
+
+import torch
+from lightseq.training.ops.pytorch.export import apply_rule
+from lightseq.training.ops.pytorch.export_quant import quantize
+from export.util import parse_args
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+
+"""
+For the mapping dictionary: key is the value of the proto parameter,
+value is a powerful expression, each && split tensor name of the matching path or expression.
+
+The sub-pattern of the path is separated by spaces, and the expression starts with a expression_.
+You can operate separately on each tensor and support multiple expressions. Multiple matching paths
+and the expression will finally be concatenated on axis = -1.
+"""
+enc_layer_mapping_dict = OrderedDict(
+ {
+ # BERT is post_layernorm
+ "multihead_norm_scale": "self_attn_layer_norm weight",
+ "multihead_norm_bias": "self_attn_layer_norm bias",
+ "multihead_project_kernel_qkv": "self_attn qkv_proj weight&&expression_.transpose(0, 1)",
+ "multihead_project_bias_qkv": "self_attn qkv_proj bias",
+ "multihead_project_kernel_output": "self_attn out_proj weight&&expression_.transpose(0, 1)",
+ "multihead_project_bias_output": "self_attn out_proj bias",
+ "ffn_norm_scale": "final_layer_norm weight",
+ "ffn_norm_bias": "final_layer_norm bias",
+ "ffn_first_kernel": "fc1 weight&&expression_.transpose(0, 1)",
+ "ffn_first_bias": "fc1 bias",
+ "ffn_second_kernel": "fc2 weight&&expression_.transpose(0, 1)",
+ "ffn_second_bias": "fc2 bias",
+ # weight_clip_max
+ "multihead_project_kernel_qkv_clip_max": "self_attn qkv_proj weight_quant clip_value_max",
+ "multihead_project_kernel_output_clip_max": "self_attn out_proj weight_quant clip_value_max",
+ "ffn_first_kernel_clip_max": "fc1 weight_quant clip_value_max",
+ "ffn_second_kernel_clip_max": "fc2 weight_quant clip_value_max",
+ # act_clip_max
+ "multihead_ln_clip_max": "self_attn qkv_proj input_quant clip_value_max",
+ "multihead_project_output_clip_max": "self_attn out_proj input_quant clip_value_max",
+ "ffn_ln_clip_max": "fc1 input_quant clip_value_max",
+ "ffn_first_act_clip_max": "fc2 input_quant clip_value_max",
+ "multihead_qkv_dense_clip_max": "self_attn qkv_proj output_quant clip_value_max",
+ "multihead_output_dense_clip_max": "self_attn out_proj output_quant clip_value_max",
+ "ffn_first_output_clip_max": "fc1 output_quant clip_value_max",
+ }
+)
+
+src_emb_mapping_dict = OrderedDict(
+ {
+ "norm_scale": "embeddings LayerNorm weight",
+ "norm_bias": "embeddings LayerNorm bias",
+ "position_embedding": "embeddings position_embeddings weight",
+ }
+)
+
+
+def fill_quant_hdf5_layer(
+ tensor_names, state_dict, hdf5_file, hdf5_dataset_prefix, mapping_dict
+):
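+    # Entries whose proto name ends with "_clip_max" are scalar quantization clip ranges
+    # and are stored as single floats; all other entries are stored as full tensors.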
+ for proto_name, ckpt_rule in mapping_dict.items():
+ target_tensor = apply_rule(proto_name, ckpt_rule, tensor_names, state_dict)
+ if proto_name.endswith("_clip_max"):
+ hdf5_file.create_dataset(
+ hdf5_dataset_prefix + proto_name, data=float(target_tensor[0])
+ )
+ else:
+ hdf5_file.create_dataset(
+ hdf5_dataset_prefix + proto_name,
+ data=target_tensor,
+ )
+
+
+def extract_bert_weights(
+ output_file,
+ model_dir,
+ head_num,
+ pad_id=0,
+ max_step=50,
+):
+ # load var names
+ state_dict = torch.load(model_dir, "cpu")
+
+ var_name_list = list(state_dict.keys())
+
+ for name in var_name_list:
+ if name.endswith("weight_quant.clip.clip_value_max"):
+ state_dict[name[:-26]] = torch.Tensor(
+ quantize(state_dict[name[:-26]].numpy(), 127, state_dict[name].numpy())
+ ).to(torch.uint8)
+
+ # initialize output file
+ print("Saving model to hdf5...")
+ print("Writing to {0}".format(output_file))
+ hdf5_file = h5py.File(output_file, "w")
+
+ # fill each encoder layer's params
+ enc_tensor_names = {}
+ for name in var_name_list:
+ name_split = name.split(".")
+ if len(name_split) <= 3 or not name_split[3].isdigit():
+ continue
+ layer_id = int(name_split[3])
+ enc_tensor_names.setdefault(layer_id, []).append(name)
+
+ # fill encoder_stack
+ for layer_id in sorted(enc_tensor_names.keys()):
+ fill_quant_hdf5_layer(
+ enc_tensor_names[layer_id],
+ state_dict,
+ hdf5_file,
+ f"encoder_stack/{layer_id}/",
+ enc_layer_mapping_dict,
+ )
+
+ # fill src_embedding - except for position embedding
+ fill_quant_hdf5_layer(
+ var_name_list,
+ state_dict,
+ hdf5_file,
+ "src_embedding/",
+ src_emb_mapping_dict,
+ )
+
+ # handling token_embeddings for BERT
+ token_embedding = (
+ state_dict["bert.embeddings.word_embeddings.weight"]
+ + state_dict["bert.embeddings.token_type_embeddings.weight"][0]
+ )
+ token_embedding = quantize(
+ token_embedding.numpy(),
+ 127,
+ state_dict["bert.embeddings.emb_quant.clip.clip_value_max"].numpy(),
+ )
+ print(f"processed token_embedding, shape: {token_embedding.shape}")
+ hdf5_file.create_dataset(
+ "src_embedding/token_embedding", data=token_embedding, dtype="uint8"
+ )
+ hdf5_file.create_dataset(
+ "src_embedding/emb_clip_max",
+ data=state_dict["bert.embeddings.emb_quant.clip.clip_value_max"],
+ )
+
+ # save number of layers metadata
+ hdf5_file.create_dataset(
+ "model_conf/n_encoder_stack", data=len(enc_tensor_names), dtype="i4"
+ )
+ # fill in model_conf
+ hdf5_file.create_dataset("model_conf/head_num", data=head_num, dtype="i4")
+ hdf5_file.create_dataset("model_conf/src_padding_id", data=pad_id, dtype="i4")
+ hdf5_file.create_dataset("model_conf/is_post_ln", data=True, dtype="?")
+ hdf5_file.create_dataset("model_conf/use_gelu", data=True, dtype="?")
+
+ # Move layernorm weights to match layernorm implementation in lightseq
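+    # (The LayerNorm parameters are rotated forward by one slot: the embedding LN moves into
+    # layer 0's attention-norm slot, each layer's attention LN moves into its FFN-norm slot,
+    # each FFN LN moves into the next layer's attention-norm slot, and the last FFN LN wraps
+    # around into the src_embedding norm slot.)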
+ tmp_scale, tmp_bias = (
+ hdf5_file["src_embedding/norm_scale"][()],
+ hdf5_file["src_embedding/norm_bias"][()],
+ )
+ for layer_id in sorted(enc_tensor_names.keys()):
+ new_tmp_scale = hdf5_file[f"encoder_stack/{layer_id}/multihead_norm_scale"][()]
+ new_tmp_bias = hdf5_file[f"encoder_stack/{layer_id}/multihead_norm_bias"][()]
+ hdf5_file[f"encoder_stack/{layer_id}/multihead_norm_scale"][()] = tmp_scale
+ hdf5_file[f"encoder_stack/{layer_id}/multihead_norm_bias"][()] = tmp_bias
+ tmp_scale, tmp_bias = new_tmp_scale, new_tmp_bias
+
+ new_tmp_scale = hdf5_file[f"encoder_stack/{layer_id}/ffn_norm_scale"][()]
+ new_tmp_bias = hdf5_file[f"encoder_stack/{layer_id}/ffn_norm_bias"][()]
+ hdf5_file[f"encoder_stack/{layer_id}/ffn_norm_scale"][()] = tmp_scale
+ hdf5_file[f"encoder_stack/{layer_id}/ffn_norm_bias"][()] = tmp_bias
+ tmp_scale, tmp_bias = new_tmp_scale, new_tmp_bias
+ hdf5_file["src_embedding/norm_scale"][()] = tmp_scale
+ hdf5_file["src_embedding/norm_bias"][()] = tmp_bias
+
+ hdf5_file.close()
+ # read-in again to double check
+ hdf5_file = h5py.File(output_file, "r")
+
+ def _print_pair(key, value):
+ if key == "sampling_method":
+ value = "".join(map(chr, value[()]))
+ else:
+ value = value[()]
+ print(f"{key}: {value}")
+
+ list(map(lambda x: _print_pair(*x), hdf5_file["model_conf"].items()))
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ model_name = ".".join(args.model.split(".")[:-1])
+ hdf5_path = f"{model_name}.hdf5"
+
+ head_number = 12
+ pad_id = 0
+ max_step = 50
+ extract_bert_weights(
+ hdf5_path,
+ args.model,
+ head_num=head_number,
+ pad_id=pad_id,
+ max_step=max_step,
+ )
diff --git a/examples/inference/python/export/huggingface/ls_torch_hf_quant_gpt2_export.py b/examples/inference/python/export/huggingface/ls_torch_hf_quant_gpt2_export.py
new file mode 100644
index 00000000..b42bb3c8
--- /dev/null
+++ b/examples/inference/python/export/huggingface/ls_torch_hf_quant_gpt2_export.py
@@ -0,0 +1,223 @@
+"""
+Export Hugging Face quantized GPT2 models to hdf5 format.
+"""
+import os
+import h5py
+from collections import OrderedDict
+
+import numpy as np
+import torch
+from lightseq.training.ops.pytorch.export import apply_rule
+from lightseq.training.ops.pytorch.export_quant import quantize
+from export.util import parse_args
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+
+"""
+For the mapping dictionary: key is the value of the proto parameter,
+value is a powerful expression, each && split tensor name of the matching path or expression.
+
+The sub-pattern of the path is separated by spaces, and the expression starts with a expression_.
+You can operate separately on each tensor and support multiple expressions. Multiple matching paths
+and the expression will finally be concatenated on axis = -1.
+"""
+enc_layer_mapping_dict = OrderedDict(
+ {
+ "multihead_norm_scale": "self_attn_layer_norm weight",
+ "multihead_norm_bias": "self_attn_layer_norm bias",
+ "multihead_project_kernel_qkv": "self_attn qkv_proj weight&&expression_.transpose(0, 1)",
+ "multihead_project_bias_qkv": "self_attn qkv_proj bias",
+ "multihead_project_kernel_output": "self_attn out_proj weight&&expression_.transpose(0, 1)",
+ "multihead_project_bias_output": "self_attn out_proj bias",
+ "ffn_norm_scale": "final_layer_norm weight",
+ "ffn_norm_bias": "final_layer_norm bias",
+ "ffn_first_kernel": "fc1 weight&&expression_.transpose(0, 1)",
+ "ffn_first_bias": "fc1 bias",
+ "ffn_second_kernel": "fc2 weight&&expression_.transpose(0, 1)",
+ "ffn_second_bias": "fc2 bias",
+ # weight_clip_max
+ "multihead_project_kernel_qkv_clip_max": "self_attn qkv_proj weight_quant clip_value_max",
+ "multihead_project_kernel_output_clip_max": "self_attn out_proj weight_quant clip_value_max",
+ "ffn_first_kernel_clip_max": "fc1 weight_quant clip_value_max",
+ "ffn_second_kernel_clip_max": "fc2 weight_quant clip_value_max",
+ # act_clip_max
+ "multihead_ln_clip_max": "self_attn qkv_proj input_quant clip_value_max",
+ "multihead_project_output_clip_max": "self_attn out_proj input_quant clip_value_max",
+ "ffn_ln_clip_max": "fc1 input_quant clip_value_max",
+ "ffn_first_act_clip_max": "fc2 input_quant clip_value_max",
+ "multihead_qkv_dense_clip_max": "self_attn qkv_proj output_quant clip_value_max",
+ "multihead_output_dense_clip_max": "self_attn out_proj output_quant clip_value_max",
+ "ffn_first_output_clip_max": "fc1 output_quant clip_value_max",
+ "self_qkv_bias_out_clip_max": "self_attn attention_quant clip_value_max",
+ }
+)
+
+src_emb_mapping_dict = OrderedDict(
+ {
+ "norm_scale": "ln_f weight",
+ "norm_bias": "ln_f bias",
+ "output_ln_clip_max": "lm_head input_quant clip_value_max",
+ "logits_clip_max": "lm_head output_quant clip_value_max",
+ }
+)
+
+
+def fill_quant_hdf5_layer(
+ tensor_names, state_dict, hdf5_file, hdf5_dataset_prefix, mapping_dict
+):
+ for proto_name, ckpt_rule in mapping_dict.items():
+ target_tensor = apply_rule(proto_name, ckpt_rule, tensor_names, state_dict)
+ if proto_name.endswith("_clip_max"):
+ hdf5_file.create_dataset(
+ hdf5_dataset_prefix + proto_name, data=float(target_tensor[0])
+ )
+ else:
+ hdf5_file.create_dataset(
+ hdf5_dataset_prefix + proto_name,
+ data=target_tensor,
+ )
+
+
+def extract_gpt_weights(
+ output_file,
+ model_dir,
+ head_num,
+ generation_method,
+ topk=1,
+ topp=0.75,
+ eos_id=50256,
+ pad_id=50257,
+ max_step=50,
+):
+ # load var names
+ state_dict = torch.load(model_dir, "cpu")
+
+ var_name_list = list(state_dict.keys())
+
+ for name in var_name_list:
+ if name.endswith("weight_quant.clip.clip_value_max"):
+ state_dict[name[:-26]] = torch.Tensor(
+ quantize(state_dict[name[:-26]].numpy(), 127, state_dict[name].numpy())
+ ).to(torch.uint8)
+
+ # initialize output file
+ print("Saving model to hdf5...")
+ print("Writing to {0}".format(output_file))
+ hdf5_file = h5py.File(output_file, "w")
+
+ # fill each encoder layer's params
+ enc_tensor_names = {}
+ for name in var_name_list:
+ name_split = name.split(".")
+ if len(name_split) <= 2 or not name_split[2].isdigit():
+ continue
+ layer_id = int(name_split[2])
+ enc_tensor_names.setdefault(layer_id, []).append(name)
+
+ # fill encoder_stack
+ for layer_id in sorted(enc_tensor_names.keys()):
+ fill_quant_hdf5_layer(
+ enc_tensor_names[layer_id],
+ state_dict,
+ hdf5_file,
+ f"encoder_stack/{layer_id}/",
+ enc_layer_mapping_dict,
+ )
+
+ # fill src_embedding - except for position embedding
+ fill_quant_hdf5_layer(
+ var_name_list,
+ state_dict,
+ hdf5_file,
+ "src_embedding/",
+ src_emb_mapping_dict,
+ )
+
+ # handling token_embeddings for GPT
+ token_embedding = state_dict["transformer.wte.weight"]
+ token_embedding = quantize(
+ token_embedding.numpy(),
+ 127,
+ state_dict["transformer.wte.emb_quant.clip.clip_value_max"].numpy(),
+ ).transpose()
+ print(f"processed token_embedding, shape: {token_embedding.shape}")
+ hdf5_file.create_dataset(
+ "src_embedding/token_embedding", data=token_embedding, dtype="uint8"
+ )
+ hdf5_file.create_dataset(
+ "src_embedding/emb_clip_max",
+ data=state_dict["transformer.wte.emb_quant.clip.clip_value_max"],
+ )
+
+ # special handling for position embedding
+ position_emb = state_dict["transformer.wpe.weight"]
+ _max_allowed_step, _ = position_emb.shape
+ if max_step > _max_allowed_step:
+        print(f"max_step {max_step} exceeds the max allowed step {_max_allowed_step}, aborting.")
+ return
+ # truncate position embedding for max_step
+ position_emb = position_emb[:max_step, :]
+ print(
+        f"processed position_embedding with max_step constraint, shape: {position_emb.shape}"
+ )
+ position_emb = position_emb.flatten().tolist()
+ hdf5_file.create_dataset(
+ "src_embedding/position_embedding", data=position_emb, dtype="f4"
+ )
+
+ # save number of layers metadata
+ hdf5_file.create_dataset(
+ "model_conf/n_encoder_stack", data=len(enc_tensor_names), dtype="i4"
+ )
+ # fill in model_conf
+ hdf5_file.create_dataset("model_conf/head_num", data=head_num, dtype="i4")
+ hdf5_file.create_dataset("model_conf/src_padding_id", data=pad_id, dtype="i4")
+ hdf5_file.create_dataset(
+ "model_conf/sampling_method",
+ data=np.array([ord(c) for c in generation_method]).astype(np.int8),
+ dtype="i1",
+ )
+ hdf5_file.create_dataset("model_conf/topp", data=topp, dtype="f4")
+ hdf5_file.create_dataset("model_conf/topk", data=topk, dtype="i4")
+ hdf5_file.create_dataset("model_conf/eos_id", data=eos_id, dtype="i4")
+
+ hdf5_file.close()
+ # read-in again to double check
+ hdf5_file = h5py.File(output_file, "r")
+
+ def _print_pair(key, value):
+ if key == "sampling_method":
+ value = "".join(map(chr, value[()]))
+ else:
+ value = value[()]
+ print(f"{key}: {value}")
+
+ list(map(lambda x: _print_pair(*x), hdf5_file["model_conf"].items()))
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ if args.generation_method not in ["topk", "topp", "ppl"]:
+ args.generation_method = "topk"
+ model_name = ".".join(args.model.split(".")[:-1])
+ hdf5_path = f"{model_name}.hdf5"
+
+ head_number = 12 # 20 for "gpt2-large"
+ topk = 1
+ topp = 0.75
+ # default eos_id from https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel
+ eos_id = 50256
+ pad_id = 50257
+ max_step = 50
+ extract_gpt_weights(
+ hdf5_path,
+ args.model,
+ head_num=head_number, # layer number
+ generation_method=args.generation_method,
+ topk=topk,
+ topp=topp,
+ eos_id=eos_id,
+ pad_id=pad_id,
+ max_step=max_step,
+ )
diff --git a/examples/inference/python/export/ls_transformer_export.py b/examples/inference/python/export/ls_transformer_export.py
index 4f549e81..49b50820 100644
--- a/examples/inference/python/export/ls_transformer_export.py
+++ b/examples/inference/python/export/ls_transformer_export.py
@@ -1,7 +1,8 @@
"""
-Export LightSeq Transformer models to protobuf/hdf5 format.
+Export LightSeq Transformer models to protobuf format.
Refer to the `examples/training/custom` directory for more training details.
"""
+import argparse
import time
import numpy as np
import torch
@@ -142,7 +143,7 @@ def create_data():
)
-def create_model(vocab_size):
+def create_config(vocab_size):
transformer_config = LSTransformer.get_config(
model="transformer-base",
max_batch_tokens=2048,
@@ -154,29 +155,7 @@ def create_model(vocab_size):
fp16=True,
local_rank=0,
)
- model = LSTransformer(transformer_config)
- model.to(dtype=torch.half, device=torch.device("cuda:0"))
- return model
-
-
-def ls_train_predict(ls_train_model, src_tokens, trg_tokens, batch_size):
- """
- NOTE: We do not use beam search here for implementation simplicity.
- """
- torch.cuda.synchronize()
- start_time = time.perf_counter()
- encoder_out, encoder_padding_mask = ls_train_model.encoder(src_tokens)
- predict_tokens = trg_tokens[:, :1]
- cache = {}
- for _ in range(trg_seq_len - 1):
- output = ls_train_model.decoder(
- predict_tokens[:, -1:], encoder_out, encoder_padding_mask, cache
- )
- output = torch.reshape(torch.argmax(output, dim=-1), (batch_size, -1))
- predict_tokens = torch.cat([predict_tokens, output], dim=-1)
- torch.cuda.synchronize()
- end_time = time.perf_counter()
- return predict_tokens, end_time - start_time
+ return transformer_config
def ls_predict(ls_infer_model, src_tokens):
@@ -188,6 +167,19 @@ def ls_predict(ls_infer_model, src_tokens):
return ls_output, end_time - start_time
+def parse_args():
+ parser = argparse.ArgumentParser(description="export LightSeq checkpoint", usage="")
+ parser.add_argument(
+ "--model",
+ "-m",
+ type=str,
+ default="checkpoint_best.pt",
+ help="path of LightSeq checkpoint",
+ )
+ args = parser.parse_args()
+ return args
+
+
if __name__ == "__main__":
(
tokenizer,
@@ -205,34 +197,23 @@ def ls_predict(ls_infer_model, src_tokens):
trg_seq_len,
) = create_data()
- ckpt_path = "checkpoint.pt"
- pb_path = "transformer.pb"
+ args = parse_args()
+ model_name = ".".join(args.model.split(".")[:-1])
+ pb_path = f"{model_name}.pb"
- with open(ckpt_path, "rb") as fin:
+ with open(args.model, "rb") as fin:
state_dict = torch.load(fin, map_location=torch.device("cpu"))
- ls_train_model = create_model(vocab_size)
- ls_train_model.load_state_dict(state_dict)
- ls_train_model.eval()
- print("torch model loaded.")
+ config = create_config(vocab_size)
- export_pb(state_dict, pb_path, pad_id, start_id, end_id, ls_train_model.config)
+ export_pb(state_dict, pb_path, pad_id, start_id, end_id, config)
ls_infer_model = lsi.Transformer(pb_path, 8)
src_tokens_np = np.array(src_tokens.cpu())
print("========================WARM UP========================")
- ls_train_predict(ls_train_model, src_tokens, trg_tokens, batch_size)
ls_predict(ls_infer_model, src_tokens_np)
- print("========================TORCH TEST========================")
- predict_tokens, ls_train_time = ls_train_predict(
- ls_train_model, src_tokens, trg_tokens, batch_size
- )
- mask = torch.cumsum(torch.eq(predict_tokens, end_id).int(), dim=1)
- predict_tokens = predict_tokens.masked_fill(mask > 0, end_id)
- predict_text = tokenizer.batch_decode(predict_tokens, skip_special_tokens=True)
-
print("========================LIGHTSEQ TEST========================")
ls_output, ls_time = ls_predict(ls_infer_model, src_tokens_np)
ls_output = [ids[0] for ids in ls_output[0]]
@@ -242,9 +223,6 @@ def ls_predict(ls_infer_model, src_tokens):
print("\n".join(src_text))
print(">>>>> target text")
print("\n".join(trg_text))
- print(">>>>> lightseq (train) predict text")
- print("\n".join(predict_text))
print(">>>>> lightseq (infer) predict text")
print("\n".join(ls_predict_text))
- print("lightseq (train) predict time: {}ms".format(ls_train_time * 1000))
print("lightseq (infer) predict time: {}ms".format(ls_time * 1000))
diff --git a/examples/inference/python/export/ls_transformer_ptq_export.py b/examples/inference/python/export/ls_transformer_ptq_export.py
index ac4c77b0..6d0e1471 100644
--- a/examples/inference/python/export/ls_transformer_ptq_export.py
+++ b/examples/inference/python/export/ls_transformer_ptq_export.py
@@ -1,8 +1,8 @@
"""
-Export LightSeq fp16/fp32 Transformer models to int8 protobuf format,
-and then using int8 quantization to speedup inference.
+Export LightSeq Transformer models to int8 protobuf format using post training quantization.
Refer to the `examples/training/custom` directory for more training details.
"""
+import argparse
import time
import numpy as np
import torch
@@ -183,6 +183,19 @@ def ls_predict(ls_infer_model, src_tokens):
return ls_output, end_time - start_time
+def parse_args():
+ parser = argparse.ArgumentParser(description="export LightSeq checkpoint", usage="")
+ parser.add_argument(
+ "--model",
+ "-m",
+ type=str,
+ default="checkpoint_best.pt",
+ help="path of LightSeq checkpoint",
+ )
+ args = parser.parse_args()
+ return args
+
+
if __name__ == "__main__":
(
tokenizer,
@@ -200,10 +213,11 @@ def ls_predict(ls_infer_model, src_tokens):
trg_seq_len,
) = create_data()
- ckpt_path = "checkpoint.pt"
- pb_path = "quant_transformer.pb"
+ args = parse_args()
+ model_name = ".".join(args.model.split(".")[:-1])
+ pb_path = f"{model_name}_ptq.pb"
- with open(ckpt_path, "rb") as fin:
+ with open(args.model, "rb") as fin:
state_dict = torch.load(fin, map_location=torch.device("cpu"))
config = create_config(vocab_size)
diff --git a/examples/inference/python/export/util.py b/examples/inference/python/export/util.py
new file mode 100644
index 00000000..7ec3ac24
--- /dev/null
+++ b/examples/inference/python/export/util.py
@@ -0,0 +1,55 @@
+import argparse
+import tensorflow as tf
+import h5py
+
+from export.proto.transformer_pb2 import Transformer
+from lightseq.training import export_pb2hdf5
+from lightseq.training import export_quant_pb2hdf5
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
+ parser.add_argument(
+ "--model",
+ "-m",
+ type=str,
+ default="checkpoint_best.pt",
+ help="path of fairseq checkpoint",
+ )
+ parser.add_argument(
+ "--hdf5",
+ "-hdf5",
+ action="store_true",
+ help="whether to store hdf5",
+ )
+ parser.add_argument(
+ "--generation_method",
+ "-g",
+ type=str,
+ default="beam_search",
+ choices=["beam_search", "topk_greedy", "topk", "topp", "ppl"],
+ help="generation method",
+ )
+ args = parser.parse_args()
+ return args
+
+
+def save_model(transformer, pb_path, hdf5_path, hdf5):
+ if not hdf5:
+ try:
+ str_model = transformer.SerializeToString()
+ print("Writing to {0}".format(pb_path))
+ with tf.io.gfile.GFile(pb_path, "wb") as fout:
+ fout.write(str_model)
+ return pb_path
+ except:
+ pass
+
+ print("Writing to {0}".format(hdf5_path))
+ f = h5py.File(hdf5_path, "w")
+ if isinstance(transformer, Transformer):
+ export_pb2hdf5(transformer, f)
+ else:
+ export_quant_pb2hdf5(transformer, f)
+ f.close()
+ return hdf5_path
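+
+
+# Minimal usage sketch (illustrative; the export scripts under examples/inference/python/export
+# follow this pattern):
+#   args = parse_args()
+#   transformer = Transformer()  # filled with weights extracted from args.model
+#   path = save_model(transformer, "model.pb", "model.hdf5", args.hdf5)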
diff --git a/examples/inference/python/test/ls_bart.py b/examples/inference/python/test/ls_bart.py
index 7738f49c..2e667c44 100644
--- a/examples/inference/python/test/ls_bart.py
+++ b/examples/inference/python/test/ls_bart.py
@@ -71,6 +71,7 @@ def main():
# change to "facebook/bart-large" for large model
hf_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
hf_model.to("cuda:0")
+ hf_model.eval()
sentences = [
"I love that girl, but does not me.",
diff --git a/examples/inference/python/test/ls_bert.py b/examples/inference/python/test/ls_bert.py
index 7e3b0d4f..baa00a3c 100644
--- a/examples/inference/python/test/ls_bert.py
+++ b/examples/inference/python/test/ls_bert.py
@@ -76,6 +76,7 @@ def main():
print("creating huggingface model...")
hf_model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
hf_model.to("cuda:0")
+ hf_model.eval()
print("creating lightseq model...")
ls_model = LightseqBertClassification("lightseq_bert_base_uncased.hdf5", hf_model)
diff --git a/examples/inference/python/test/ls_fairseq.sh b/examples/inference/python/test/ls_fairseq.sh
index bf6b4d75..9ff1a6d7 100644
--- a/examples/inference/python/test/ls_fairseq.sh
+++ b/examples/inference/python/test/ls_fairseq.sh
@@ -3,7 +3,7 @@
until [[ -z "$1" ]]
do
case $1 in
- --model)
+ -m)
shift; MODEL=$1;
shift;;
*)
diff --git a/examples/inference/python/test/ls_gpt2.py b/examples/inference/python/test/ls_gpt2.py
index bc0f980b..abbd78a6 100644
--- a/examples/inference/python/test/ls_gpt2.py
+++ b/examples/inference/python/test/ls_gpt2.py
@@ -2,30 +2,62 @@
import argparse
import torch
-import numpy as np
import lightseq.inference as lsi
from transformers import GPT2Tokenizer, GPT2LMHeadModel
-def ls_gpt2(model, inputs):
+def ls_gpt2(model, inputs, generation_method="topk"):
torch.cuda.synchronize()
start_time = time.perf_counter()
- generated_ids = model.sample(inputs)
+ results = None
+ if generation_method == "topk" or generation_method == "topp":
+ results = model.sample(inputs)
+ elif generation_method == "ppl":
+ results = model.ppl(inputs)[0]
torch.cuda.synchronize()
end_time = time.perf_counter()
- return generated_ids, end_time - start_time
+ return results, end_time - start_time
-def hf_gpt2(model, inputs, tokenizer):
+def compute_hf_ppl(model, inputs):
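+    # Sliding-window perplexity (following the common Hugging Face recipe): score windows
+    # of `stride` tokens, mask the overlapping context with label -100 so it is ignored by
+    # the loss, and divide the summed negative log-likelihood by the total number of tokens.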
+ max_length = 512
+ stride = 512
+ end_loc = 0
+
+ nlls = []
+ for i in range(0, inputs.size(1), stride):
+ begin_loc = max(i + stride - max_length, 0)
+ end_loc = min(i + stride, inputs.size(1))
+ trg_len = end_loc - i
+ input_ids = inputs[:, begin_loc:end_loc].to("cuda:0")
+ target_ids = input_ids.clone()
+ target_ids[:, :-trg_len] = -100
+
+ with torch.no_grad():
+ outputs = model(input_ids, labels=target_ids)
+ neg_log_likelihood = outputs[0] * trg_len
+
+ nlls.append(neg_log_likelihood)
+
+ ppl = torch.stack(nlls).sum() / end_loc
+ return ppl.cpu().numpy()
+
+
+def hf_gpt2(model, inputs, tokenizer, generation_method="topk"):
inputs = inputs.to("cuda:0")
torch.cuda.synchronize()
start_time = time.perf_counter()
- generated_ids = model.generate(
- inputs, max_length=50, pad_token_id=tokenizer.eos_token_id
- )
+ results = None
+ if generation_method == "topk" or generation_method == "topp":
+ results = model.generate(
+ inputs, max_length=50, pad_token_id=tokenizer.eos_token_id
+ )
+ elif generation_method == "ppl":
+ results = compute_hf_ppl(model, inputs)
+
torch.cuda.synchronize()
end_time = time.perf_counter()
- return generated_ids, end_time - start_time
+ return results, end_time - start_time
def ls_generate(model, tokenizer, inputs):
@@ -50,17 +82,49 @@ def hf_generate(model, tokenizer, inputs):
print(sent)
-def warmup(ls_tokenizer, hf_tokenizer, ls_model, hf_model, sentences):
+def ls_ppl(model, tokenizer, inputs):
+ print("=========lightseq=========")
+ print("lightseq calculating ppl...")
+ ls_ppl, ls_time = ls_gpt2(model, inputs, "ppl")
+ print(f"lightseq time: {ls_time}s")
+ print("lightseq results:")
+ print(ls_ppl)
+
+
+def hf_ppl(model, tokenizer, inputs):
+ print("=========huggingface=========")
+ print("huggingface calculating ppl...")
+ hf_ppl, hf_time = hf_gpt2(model, inputs, tokenizer, "ppl")
+ print(f"huggingface time: {hf_time}s")
+ print("huggingface results:")
+ print(hf_ppl)
+
+
+def warmup(
+ ls_tokenizer, hf_tokenizer, ls_model, hf_model, sentences, generation_method
+):
ls_inputs = ls_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
hf_inputs = hf_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
- ls_generate(ls_model, ls_tokenizer, ls_inputs)
- hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ if generation_method == "topk" or generation_method == "topp":
+ ls_generate(ls_model, ls_tokenizer, ls_inputs)
+ hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ elif generation_method == "ppl":
+ ls_ppl(ls_model, ls_tokenizer, ls_inputs)
+ hf_ppl(hf_model, hf_tokenizer, hf_inputs)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--user_input", action="store_true")
+ parser.add_argument(
+ "--generation_method",
+ "-g",
+ type=str,
+ default="topk",
+ choices=["topk", "topp", "ppl"],
+ help="generation method",
+ )
args = parser.parse_args()
print("initializing gpt tokenizer...")
@@ -81,18 +145,26 @@ def main():
print("creating huggingface model...")
hf_model = GPT2LMHeadModel.from_pretrained("gpt2")
hf_model.to("cuda:0")
+ hf_model.eval()
# lightseq gpt perplexity supports batch infer with different lengths,
# but sampling doesn't support
sentences = [
- "My name is GPT",
- "My name is GPT",
- "My name is GPT",
- "My name is GPT",
+ "I love you, but you say that",
+ "I love you, but you say that",
+ "I love you, but you say that",
+ "I love you, but you say that",
]
print("====================START warmup====================")
- warmup(ls_tokenizer, hf_tokenizer, ls_model, hf_model, sentences)
+ warmup(
+ ls_tokenizer,
+ hf_tokenizer,
+ ls_model,
+ hf_model,
+ sentences,
+ args.generation_method,
+ )
print("====================END warmup====================")
while True:
@@ -108,8 +180,12 @@ def main():
"input_ids"
]
- ls_generate(ls_model, ls_tokenizer, ls_inputs)
- hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ if args.generation_method == "topk" or args.generation_method == "topp":
+ ls_generate(ls_model, ls_tokenizer, ls_inputs)
+ hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ elif args.generation_method == "ppl":
+ ls_ppl(ls_model, ls_tokenizer, ls_inputs)
+ hf_ppl(hf_model, hf_tokenizer, hf_inputs)
if not args.user_input:
break
diff --git a/examples/inference/python/test/ls_quant_bert.py b/examples/inference/python/test/ls_quant_bert.py
new file mode 100644
index 00000000..29046866
--- /dev/null
+++ b/examples/inference/python/test/ls_quant_bert.py
@@ -0,0 +1,176 @@
+import time
+
+import torch
+from transformers import BertTokenizer, BertForTokenClassification, BertConfig
+import lightseq.inference as lsi
+from lightseq.training.ops.pytorch.quantization import qat_mode
+from lightseq.training.ops.pytorch.torch_transformer_layers import (
+ BertEmbeddingLayer,
+ TransformerEncoderLayer,
+)
+from export.util import parse_args
+
+
+def ls_bert(model, inputs):
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+ ls_output = model.infer(inputs)
+ torch.cuda.synchronize()
+ end_time = time.perf_counter()
+ return ls_output, end_time - start_time
+
+
+def hf_bert(model, inputs, attn_mask):
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+ hf_output = model(inputs.to("cuda:0"), attention_mask=attn_mask.to("cuda:0"))
+ torch.cuda.synchronize()
+ end_time = time.perf_counter()
+ return hf_output, end_time - start_time
+
+
+def ls_generate(model, inputs_id):
+ print("=========lightseq=========")
+ print("lightseq generating...")
+ ls_output, ls_time = ls_bert(model, inputs_id)
+ print(f"lightseq time: {ls_time}s")
+ print("lightseq results (class predictions):")
+ print(ls_output.argmax(axis=2).detach().cpu().numpy())
+
+
+def hf_generate(model, inputs_id, attn_mask):
+ print("=========huggingface=========")
+ print("huggingface generating...")
+ hf_output, hf_time = hf_bert(model, inputs_id, attn_mask)
+ print(f"huggingface time: {hf_time}s")
+ print("huggingface results (class predictions):")
+ print(hf_output.logits.argmax(axis=2).detach().cpu().numpy())
+
+
+def warmup(tokenizer, ls_model, hf_model, sentences):
+ inputs = tokenizer(sentences, return_tensors="pt", padding=True)
+ inputs_id = inputs["input_ids"]
+ attn_mask = inputs["attention_mask"]
+
+ ls_generate(ls_model, inputs_id)
+ hf_generate(hf_model, inputs_id, attn_mask)
+
+
+class LightseqBertClassification:
+ def __init__(self, ls_weight_path, hf_model):
+ self.ls_bert = lsi.QuantBert(ls_weight_path, 8)
+ self.classifier = hf_model.classifier
+
+ def infer(self, inputs):
+ last_hidden_states = self.ls_bert.infer(inputs)
+ last_hidden_states = torch.Tensor(last_hidden_states).float()
+ logits = self.classifier(last_hidden_states.to("cuda:0"))
+ return logits
+
+
+def gen_bert_emb_config(config):
+ bert_emb_config = BertEmbeddingLayer.get_config(
+ vocab_size=config.vocab_size,
+ embedding_dim=config.hidden_size,
+ max_batch_tokens=4096,
+ max_seq_len=config.max_position_embeddings,
+ padding_idx=config.pad_token_id,
+ dropout=config.hidden_dropout_prob,
+ fp16=True,
+ local_rank=0,
+ )
+ bert_emb_config.type_vocab_size = config.type_vocab_size
+ bert_emb_config.layer_norm_eps = config.layer_norm_eps
+ return bert_emb_config
+
+
+class LSHFTransformerEncoderLayer(TransformerEncoderLayer):
+ def __init__(self, *args, **kwargs):
+ super(LSHFTransformerEncoderLayer, self).__init__(*args, **kwargs)
+
+ def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs):
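+        # Hugging Face passes an additive attention mask (0 for visible tokens, -10000 for
+        # padding); dividing by -10000 recovers the 0/1 padding mask LightSeq layers expect.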
+ ls_encoder_padding_mask = encoder_padding_mask / -10000.0
+ ls_encoder_padding_mask = ls_encoder_padding_mask.squeeze()
+ output = super().forward(hidden_states, ls_encoder_padding_mask)
+ return (output, None, None, None)
+
+
+def gen_bert_enc_config(config):
+ bert_enc_config = TransformerEncoderLayer.get_config(
+ max_batch_tokens=4096,
+ max_seq_len=config.max_position_embeddings,
+ hidden_size=config.hidden_size,
+ intermediate_size=config.intermediate_size,
+ nhead=config.num_attention_heads,
+ attn_prob_dropout_ratio=config.attention_probs_dropout_prob,
+ activation_dropout_ratio=config.hidden_dropout_prob,
+ hidden_dropout_ratio=config.hidden_dropout_prob,
+ pre_layer_norm=False,
+ fp16=True,
+ local_rank=0,
+ activation_fn="gelu",
+ )
+ return bert_enc_config
+
+
+def inject_ls_layer(model, config):
+ bert_emb_config = gen_bert_emb_config(config)
+ model.bert.embeddings = BertEmbeddingLayer(bert_emb_config)
+ model.bert.embeddings.apply(qat_mode)
+
+ for i in range(config.num_hidden_layers):
+ bert_enc_config = gen_bert_enc_config(config)
+ model.bert.encoder.layer[i] = LSHFTransformerEncoderLayer(
+ bert_enc_config
+ ).cuda()
+ model.bert.encoder.layer[i].apply(qat_mode)
+
+
+def main():
+ args = parse_args()
+ model_name = ".".join(args.model.split(".")[:-1])
+ ckpt_path = f"{model_name}.bin"
+
+ print("initializing bert config...")
+ config = BertConfig.from_pretrained(
+ "bert-base-uncased", num_labels=9, finetuning_task="ner"
+ )
+
+ print("initializing bert tokenizer...")
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+ print("creating huggingface model...")
+ hf_model = BertForTokenClassification.from_pretrained(
+ "bert-base-uncased", config=config
+ )
+ inject_ls_layer(hf_model, config)
+ state_dict = torch.load(ckpt_path, map_location="cpu")
+ hf_model.load_state_dict(state_dict, strict=False)
+ hf_model.to("cuda:0")
+ hf_model.eval()
+
+ print("creating lightseq model...")
+ ls_model = LightseqBertClassification(args.model, hf_model)
+
+ sentences = [
+ "EU rejects German call to boycott British lamb .",
+ "-- Dimitris Kontogiannis , Athens Newsroom +301 3311812-4",
+ "BayerVB sets C$ 100 million six-year bond .",
+ "China says time right for Taiwan talks .",
+ ]
+
+ print("====================START warmup====================")
+ warmup(tokenizer, ls_model, hf_model, sentences)
+ print("====================END warmup====================")
+
+ print("tokenizing the sentences...")
+ inputs = tokenizer(sentences, return_tensors="pt", padding=True)
+ inputs_id = inputs["input_ids"]
+ attn_mask = inputs["attention_mask"]
+
+ ls_generate(ls_model, inputs_id)
+ hf_generate(hf_model, inputs_id, attn_mask)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/inference/python/test/ls_quant_gpt2.py b/examples/inference/python/test/ls_quant_gpt2.py
new file mode 100644
index 00000000..033ac5b4
--- /dev/null
+++ b/examples/inference/python/test/ls_quant_gpt2.py
@@ -0,0 +1,251 @@
+import time
+
+import torch
+from torch import nn
+from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
+import lightseq.inference as lsi
+from lightseq.training.ops.pytorch.quantization import (
+ qat_mode,
+ QuantLinear,
+ TensorQuantizer,
+ weight_quant_config,
+)
+from lightseq.training.ops.pytorch.torch_transformer_layers import (
+ TransformerDecoderLayer,
+)
+from export.util import parse_args
+
+
+def ls_gpt2(model, inputs, generation_method="topk"):
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+ results = None
+ if generation_method == "topk" or generation_method == "topp":
+ results = model.sample(inputs)
+ elif generation_method == "ppl":
+ results = model.ppl(inputs)[0]
+ torch.cuda.synchronize()
+ end_time = time.perf_counter()
+ return results, end_time - start_time
+
+
+def compute_hf_ppl(model, inputs):
+ max_length = 512
+ stride = 512
+ end_loc = 0
+
+ nlls = []
+ for i in range(0, inputs.size(1), stride):
+ begin_loc = max(i + stride - max_length, 0)
+ end_loc = min(i + stride, inputs.size(1))
+ trg_len = end_loc - i
+ input_ids = inputs[:, begin_loc:end_loc].to("cuda:0")
+ target_ids = input_ids.clone()
+ target_ids[:, :-trg_len] = -100
+
+ with torch.no_grad():
+ outputs = model(input_ids, labels=target_ids)
+ neg_log_likelihood = outputs[0] * trg_len
+
+ nlls.append(neg_log_likelihood)
+
+ ppl = torch.stack(nlls).sum() / end_loc
+ return ppl.cpu().numpy()
+
+
+def hf_gpt2(model, inputs, tokenizer, generation_method="topk"):
+ inputs = inputs.to("cuda:0")
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+ results = None
+ if generation_method == "topk" or generation_method == "topp":
+ results = model.generate(
+ inputs, max_length=50, pad_token_id=tokenizer.eos_token_id
+ )
+ elif generation_method == "ppl":
+ results = compute_hf_ppl(model, inputs)
+
+ torch.cuda.synchronize()
+ end_time = time.perf_counter()
+ return results, end_time - start_time
+
+
+def ls_generate(model, tokenizer, inputs):
+ print("=========lightseq=========")
+ print("lightseq generating...")
+ ls_res_ids, ls_time = ls_gpt2(model, inputs)
+ ls_res = tokenizer.batch_decode(ls_res_ids, skip_special_tokens=True)
+ print(f"lightseq time: {ls_time}s")
+ print("lightseq results:")
+ for sent in ls_res:
+ print(sent)
+
+
+def hf_generate(model, tokenizer, inputs):
+ print("=========huggingface=========")
+ print("huggingface generating...")
+ hf_res_ids, hf_time = hf_gpt2(model, inputs, tokenizer)
+ hf_res = tokenizer.batch_decode(hf_res_ids, skip_special_tokens=True)
+ print(f"huggingface time: {hf_time}s")
+ print("huggingface results:")
+ for sent in hf_res:
+ print(sent)
+
+
+def ls_ppl(model, tokenizer, inputs):
+ print("=========lightseq=========")
+ print("lightseq calculating ppl...")
+ ls_ppl, ls_time = ls_gpt2(model, inputs, "ppl")
+ print(f"lightseq time: {ls_time}s")
+ print("lightseq results:")
+ print(ls_ppl)
+
+
+def hf_ppl(model, tokenizer, inputs):
+ print("=========huggingface=========")
+ print("huggingface calculating ppl...")
+ hf_ppl, hf_time = hf_gpt2(model, inputs, tokenizer, "ppl")
+ print(f"huggingface time: {hf_time}s")
+ print("huggingface results:")
+ print(hf_ppl)
+
+
+def warmup(
+ ls_tokenizer, hf_tokenizer, ls_model, hf_model, sentences, generation_method
+):
+ ls_inputs = ls_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
+ hf_inputs = hf_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
+
+ if generation_method == "topk" or generation_method == "topp":
+ ls_generate(ls_model, ls_tokenizer, ls_inputs)
+ # hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ elif generation_method == "ppl":
+ ls_ppl(ls_model, ls_tokenizer, ls_inputs)
+ hf_ppl(hf_model, hf_tokenizer, hf_inputs)
+
+
+class GptEmbedding(nn.Embedding):
+ def __init__(self, *args, **kwargs):
+ super(GptEmbedding, self).__init__(*args, **kwargs)
+ self.emb_quant = TensorQuantizer(weight_quant_config)
+
+ def forward(self, input_ids):
+ x = super(GptEmbedding, self).forward(input_ids)
+ x = self.emb_quant(x)
+ return x
+
+
+def gen_gpt_enc_config(config):
+ gpt_enc_config = TransformerDecoderLayer.get_config(
+ max_batch_tokens=8192,
+ max_seq_len=config.max_position_embeddings,
+ hidden_size=config.hidden_size,
+ intermediate_size=4 * config.hidden_size,
+ nhead=config.num_attention_heads,
+ attn_prob_dropout_ratio=config.attn_pdrop,
+ activation_dropout_ratio=config.resid_pdrop,
+ hidden_dropout_ratio=config.resid_pdrop,
+ pre_layer_norm=True,
+ fp16=True,
+ local_rank=0,
+ nlayer=config.num_hidden_layers,
+ activation_fn="gelu",
+ has_cross_attn=False,
+ )
+ return gpt_enc_config
+
+
+class LSHFGptEncoderLayer(TransformerDecoderLayer):
+ def __init__(self, *args, **kwargs):
+ super(LSHFGptEncoderLayer, self).__init__(*args, **kwargs)
+
+ def forward(self, hidden_states, attention_mask=None, *args, **kwargs):
+ if attention_mask is not None:
+ ls_attention_mask = attention_mask.squeeze()
+ else:
+ ls_attention_mask = torch.zeros(hidden_states.size()[:2])
+ output = super().forward(hidden_states, ls_attention_mask)
+ return output
+
+
+def inject_ls_layer(model, config):
+ model.transformer.wte = GptEmbedding(config.vocab_size, config.hidden_size)
+ model.transformer.wte.apply(qat_mode)
+
+ for i in range(config.num_hidden_layers):
+ gpt_enc_config = gen_gpt_enc_config(config)
+ model.transformer.h[i] = LSHFGptEncoderLayer(gpt_enc_config).cuda()
+ model.transformer.h[i].apply(qat_mode)
+
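+    # Tie the quantized lm_head to the token embedding: the projection reuses the embedding
+    # weight and its quantizer, so the logits share the same int8 clip range.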
+ q_lm_head = QuantLinear(config.n_embd, config.vocab_size, bias=False)
+ q_lm_head.weight = model.transformer.wte.weight
+ q_lm_head.weight_quant = model.transformer.wte.emb_quant
+ model.lm_head = q_lm_head
+
+
+def main():
+ args = parse_args()
+ if args.generation_method not in ["topk", "topp", "ppl"]:
+ args.generation_method = "topk"
+ model_name = ".".join(args.model.split(".")[:-1])
+ ckpt_path = f"{model_name}.bin"
+
+ print("initializing gpt2 config...")
+ config = GPT2Config.from_pretrained("gpt2")
+
+ print("initializing gpt2 tokenizer...")
+ ls_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+    # lightseq uses len(tokenizer) as the pad token id by default
+ ls_tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+ print(f"lightseq tokenizer pad token id: {ls_tokenizer.pad_token_id}")
+
+ hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+    # use EOS as PAD for huggingface to avoid warnings (see https://huggingface.co/blog/how-to-generate) while avoiding reshaping the model embedding
+ hf_tokenizer.pad_token = hf_tokenizer.eos_token
+ print(f"huggingface tokenizer pad token id: {hf_tokenizer.pad_token_id}")
+
+ print("creating huggingface model...")
+ hf_model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
+ inject_ls_layer(hf_model, config)
+ state_dict = torch.load(ckpt_path, map_location="cpu")
+ hf_model.load_state_dict(state_dict, strict=False)
+ hf_model.to("cuda:0")
+ hf_model.eval()
+
+ print("creating lightseq model...")
+ ls_model = lsi.QuantGpt(args.model, max_batch_size=16)
+
+    # lightseq gpt perplexity supports batch inference with different lengths,
+    # but sampling does not
+ sentences = [
+ "I love you, but you say that",
+ "I love you, but you say that",
+ "I love you, but you say that",
+ "I love you, but you say that",
+ ]
+
+ print("====================START warmup====================")
+ warmup(
+ ls_tokenizer,
+ hf_tokenizer,
+ ls_model,
+ hf_model,
+ sentences,
+ args.generation_method,
+ )
+ print("====================END warmup====================")
+
+ print("tokenizing the sentences...")
+ ls_inputs = ls_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
+ hf_inputs = hf_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
+
+ if args.generation_method == "topk" or args.generation_method == "topp":
+ ls_generate(ls_model, ls_tokenizer, ls_inputs)
+ # hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ elif args.generation_method == "ppl":
+ ls_ppl(ls_model, ls_tokenizer, ls_inputs)
+ hf_ppl(hf_model, hf_tokenizer, hf_inputs)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/training/custom/README.md b/examples/training/custom/README.md
index be38ed7d..27494f33 100644
--- a/examples/training/custom/README.md
+++ b/examples/training/custom/README.md
@@ -6,7 +6,7 @@ The source inputs of the encoder are batch of sentences and the target outputs o
You can run the example simply by:
```shell
-python examples/training/custom/run.py
+$ python examples/training/custom/run.py
```
If it runs successfully, you will see the following output:
diff --git a/examples/training/deepspeed/README.md b/examples/training/deepspeed/README.md
index ac078949..63c88ff3 100644
--- a/examples/training/deepspeed/README.md
+++ b/examples/training/deepspeed/README.md
@@ -3,12 +3,12 @@ This repo contains an example for how to use LightSeq to accerate the training o
First you should install these requirements.
```shell
-pip install torch ninja fairseq deepspeed
+$ pip install torch ninja fairseq deepspeed
```
Then you can train a translation task on wmt14 en2de dataset by running the following script:
```shell
-sh examples/training/deepspeed/ds_fairseq_wmt14en2de.sh
+$ sh examples/training/deepspeed/ds_fairseq_wmt14en2de.sh
```
This script first downloads the dataset and then runs the native Fairseq training script using the DeepSpeed launcher without any other parameter modifications.
diff --git a/examples/training/fairseq/README.md b/examples/training/fairseq/README.md
index 623ad511..093ddfe7 100644
--- a/examples/training/fairseq/README.md
+++ b/examples/training/fairseq/README.md
@@ -1,26 +1,32 @@
# LightSeq for Fairseq
-This repo contains an example for how to use LightSeq to accerate the training of translation task in [Fairseq](https://github.com/pytorch/fairseq).
+This repo contains examples of how to use LightSeq to accelerate the training of translation tasks in [Fairseq](https://github.com/pytorch/fairseq).
First you should install these requirements.
```shell
-pip install lightseq fairseq sacremoses
+$ pip install lightseq fairseq sacremoses
```
## Train
-Then you can train a translation task on wmt14 en2de dataset by running the following script:
+Then you can train a translation task on the wmt14 en2de dataset using LightSeq by running the following script:
```shell
-sh examples/training/fairseq/ls_fairseq_wmt14en2de.sh
+$ sh examples/training/fairseq/ls_fairseq_wmt14en2de.sh
```
Or you can use LightSeq modules like `--arch ls_transformer_wmt_en_de_big_t2t`,
by adding `--user-dir=${LIGHTSEQ_DIR}/lightseq/training/cli/fs_modules`
to `fairseq-train`.
+You can use `--use-torch-layer` to replace LightSeq layers with custom Torch layers based on native Fairseq layers.
+
+You can use `--enable-quant` and `--quant-mode qat` to run quantization-aware training for subsequent LightSeq fast int8 inference (see the example below).
+
This script first downloads the dataset and then runs the native Fairseq
training script using the optimized model and optimizer.
The `lightseq-train` command is just an easy-to-use wrapper of `fairseq-train` that adds
LightSeq to `--user-dir`.
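+For example, a minimal sketch that combines `lightseq-train` with the flags above (the data
+path and the remaining hyperparameters are assumptions, adjust them to your setup):
+```shell
+$ lightseq-train /tmp/wmt14_en_de/ \
+    --task translation \
+    --arch ls_transformer_wmt_en_de_big_t2t \
+    --use-torch-layer \
+    --enable-quant --quant-mode qat \
+    --fp16
+```
+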
+We also provide other training scripts to support custom Torch layers and quantization. All model files have been publicly released. **Refer to [examples/inference/python/README.md](../../../examples/inference/python/README.md) for more training, export and inference details.**
+
LightSeq can achieve about 1.47x speedup using batch size 4096 on 8 V100 GPUs,
compared with original Fairseq implementation. You can delete the `ls` prefix in parameters
to switch to fairseq modules.
@@ -28,7 +34,7 @@ to switch to fairseq modules.
## Evaluation
Then you can evaluate on wmt14 en2de dataset by running the following command:
```shell
-lightseq-validate /tmp/wmt14_en_de/ \
+$ lightseq-validate /tmp/wmt14_en_de/ \
--valid-subset valid \
--path checkpoints/checkpoint_best.pt \
--task translation \
@@ -41,11 +47,11 @@ lightseq-validate /tmp/wmt14_en_de/ \
## Generate
You can also generate on wmt14 en2de dataset by running the following command:
```shell
-lightseq-generate /tmp/wmt14_en_de/ \
+$ lightseq-generate /tmp/wmt14_en_de/ \
--gen-subset test \
--path checkpoints/checkpoint_best.pt \
--task translation \
- --max-tokens 8192 \
+ --batch-size 128 \
--beam 4 \
--lenpen 0.6 \
--fp16 \
diff --git a/examples/training/huggingface/README.md b/examples/training/huggingface/README.md
deleted file mode 100644
index d8686202..00000000
--- a/examples/training/huggingface/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# LightSeq for HuggingFace
-
-This repo contains an example for how to use LightSeq to accerate the training of BERT in HuggingFace [Transformers](https://github.com/huggingface/transformers).
-
-We modify the token classification [examples](https://github.com/huggingface/transformers/tree/master/examples/pytorch/token-classification) in HuggingFace Transformers by replacing their encoder layers with the fused ones in LightSeq.
-
-First you should install these requirements.
-
-```shell
-pip install torch ninja transformers seqeval datasets
-```
-
-Then you can easily fine-tunes BERT on CoNLL-2003 by running the bash script `run_ner.sh`
-or on GLUE by `run_glue.sh`. From our tests, speedup is about 1.6x .
diff --git a/examples/training/huggingface/bert/README.md b/examples/training/huggingface/bert/README.md
new file mode 100644
index 00000000..77dde9aa
--- /dev/null
+++ b/examples/training/huggingface/bert/README.md
@@ -0,0 +1,19 @@
+# LightSeq for HuggingFace BERT
+
+This repo contains an example of how to use LightSeq to accelerate the training of BERT in HuggingFace [Transformers](https://github.com/huggingface/transformers).
+
+We modify examples in HuggingFace Transformers, such as the token classification [examples](https://github.com/huggingface/transformers/tree/master/examples/pytorch/token-classification), by replacing their encoder layers with the fused ones in LightSeq.
+
+First you should install these requirements.
+
+```shell
+$ pip install torch ninja transformers seqeval datasets
+```
+
+Before running the following training scripts, you need to switch to this directory:
+```shell
+$ cd examples/training/huggingface/bert
+```
+
+Then you can easily fine-tune BERT on different tasks by running the bash scripts `task_ner/run_ner.sh`,
+`task_glue/run_glue.sh`, `task_qa/run_qa.sh`, etc. From our tests, the speedup is about 1.6x.
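+
+For example, to fine-tune on the CoNLL-2003 NER task (run from this directory):
+```shell
+$ sh task_ner/run_ner.sh
+```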
diff --git a/examples/training/huggingface/__init__.py b/examples/training/huggingface/bert/__init__.py
similarity index 100%
rename from examples/training/huggingface/__init__.py
rename to examples/training/huggingface/bert/__init__.py
diff --git a/examples/training/huggingface/bert/ls_hf_transformer_layer.py b/examples/training/huggingface/bert/ls_hf_transformer_layer.py
new file mode 100644
index 00000000..6ad9b8d8
--- /dev/null
+++ b/examples/training/huggingface/bert/ls_hf_transformer_layer.py
@@ -0,0 +1,116 @@
+from lightseq.training.ops.pytorch.quantization import qat_mode, disable_quant
+from lightseq.training.ops.pytorch.torch_transformer_layers import BertEmbeddingLayer
+
+
+def get_hf_bert_enc_layer_params(layer):
+ init_ws = []
+ init_bs = []
+
+ init_ws.append(layer.attention.self.query.weight.detach().clone())
+ init_bs.append(layer.attention.self.query.bias.detach().clone())
+ init_ws.append(layer.attention.self.key.weight.detach().clone())
+ init_bs.append(layer.attention.self.key.bias.detach().clone())
+ init_ws.append(layer.attention.self.value.weight.detach().clone())
+ init_bs.append(layer.attention.self.value.bias.detach().clone())
+ init_ws.append(layer.attention.output.dense.weight.detach().clone())
+ init_bs.append(layer.attention.output.dense.bias.detach().clone())
+ init_ws.append(layer.attention.output.LayerNorm.weight.detach().clone())
+ init_bs.append(layer.attention.output.LayerNorm.bias.detach().clone())
+
+ init_ws.append(layer.intermediate.dense.weight.detach().clone())
+ init_bs.append(layer.intermediate.dense.bias.detach().clone())
+ init_ws.append(layer.output.dense.weight.detach().clone())
+ init_bs.append(layer.output.dense.bias.detach().clone())
+ init_ws.append(layer.output.LayerNorm.weight.detach().clone())
+ init_bs.append(layer.output.LayerNorm.bias.detach().clone())
+
+ return init_ws, init_bs
+
+
+def get_hf_bert_emb_layer_params(layer):
+ init_ws = []
+
+ init_ws.append(layer.word_embeddings.weight.detach().clone())
+ init_ws.append(layer.position_embeddings.weight.detach().clone())
+ init_ws.append(layer.token_type_embeddings.weight.detach().clone())
+ init_ws.append(layer.LayerNorm.weight.detach().clone())
+ init_ws.append(layer.LayerNorm.bias.detach().clone())
+
+ return init_ws
+
+
+def gen_bert_emb_config(training_args, config):
+ bert_emb_config = BertEmbeddingLayer.get_config(
+ vocab_size=config.vocab_size,
+ embedding_dim=config.hidden_size,
+ max_batch_tokens=4096,
+ max_seq_len=config.max_position_embeddings,
+ padding_idx=config.pad_token_id,
+ dropout=config.hidden_dropout_prob,
+ fp16=training_args.fp16,
+ local_rank=training_args.local_rank,
+ )
+ bert_emb_config.type_vocab_size = config.type_vocab_size
+ bert_emb_config.layer_norm_eps = config.layer_norm_eps
+ return bert_emb_config
+
+
+def inject_ls_layer(model, training_args, model_args, config):
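+    # module_type selects the encoder implementation (see ModelArguments in run_glue.py):
+    # 1 uses the LightSeq CUDA layer, 2 uses the custom Torch layer; quantization (qat_mode)
+    # is only applied when module_type is 2.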
+ if model_args.module_type == 2:
+ from lightseq.training.ops.pytorch.torch_transformer_layers import (
+ TransformerEncoderLayer,
+ )
+ elif model_args.module_type == 1:
+ from lightseq.training.ops.pytorch.transformer_encoder_layer import (
+ LSTransformerEncoderLayer as TransformerEncoderLayer,
+ )
+ else:
+ raise NotImplementedError
+
+ if model_args.module_type == 2:
+ bert_emb_config = gen_bert_emb_config(training_args, config)
+ init_ws = get_hf_bert_emb_layer_params(model.bert.embeddings)
+ model.bert.embeddings = BertEmbeddingLayer(bert_emb_config, init_ws)
+ if model_args.enable_quant:
+ model.bert.embeddings.apply(qat_mode)
+ else:
+ model.bert.embeddings.apply(disable_quant)
+
+ class LSHFTransformerEncoderLayer(TransformerEncoderLayer):
+ def __init__(self, *args, **kwargs):
+ super(LSHFTransformerEncoderLayer, self).__init__(*args, **kwargs)
+
+ def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs):
+ ls_encoder_padding_mask = encoder_padding_mask / -10000.0
+ ls_encoder_padding_mask = ls_encoder_padding_mask.squeeze()
+ output = super().forward(hidden_states, ls_encoder_padding_mask)
+ return (output, None, None, None)
+
+ def gen_bert_enc_config(training_args, config):
+ bert_enc_config = TransformerEncoderLayer.get_config(
+ max_batch_tokens=4096,
+ max_seq_len=config.max_position_embeddings,
+ hidden_size=config.hidden_size,
+ intermediate_size=config.intermediate_size,
+ nhead=config.num_attention_heads,
+ attn_prob_dropout_ratio=config.attention_probs_dropout_prob,
+ activation_dropout_ratio=config.hidden_dropout_prob,
+ hidden_dropout_ratio=config.hidden_dropout_prob,
+ pre_layer_norm=False,
+ fp16=training_args.fp16,
+ local_rank=training_args.local_rank,
+ activation_fn="gelu",
+ )
+ return bert_enc_config
+
+ for i in range(config.num_hidden_layers):
+ bert_enc_config = gen_bert_enc_config(training_args, config)
+ init_ws, init_bs = get_hf_bert_enc_layer_params(model.bert.encoder.layer[i])
+ model.bert.encoder.layer[i] = LSHFTransformerEncoderLayer(
+ bert_enc_config, init_ws, init_bs
+ ).cuda()
+ if model_args.module_type == 2:
+ if model_args.enable_quant:
+ model.bert.encoder.layer[i].apply(qat_mode)
+ else:
+ model.bert.encoder.layer[i].apply(disable_quant)
diff --git a/examples/training/huggingface/run_glue.py b/examples/training/huggingface/bert/task_glue/run_glue.py
similarity index 98%
rename from examples/training/huggingface/run_glue.py
rename to examples/training/huggingface/bert/task_glue/run_glue.py
index 1a2274da..0b3b62ca 100644
--- a/examples/training/huggingface/run_glue.py
+++ b/examples/training/huggingface/bert/task_glue/run_glue.py
@@ -45,7 +45,7 @@
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
-from ls_hf_transformer_encoder_layer import inject_ls_enc_layer
+from ls_hf_transformer_layer import inject_ls_layer
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -224,9 +224,15 @@ class ModelArguments:
"with private models)."
},
)
- with_lightseq: bool = field(
- default=True,
- metadata={"help": "Whether to use lightseq TransformerEncoder"},
+ module_type: int = field(
+ default=1,
+ metadata={
+ "help": "0: original Hugging Face layer, 1: LightSeq CUDA layer, 2: custom Torch layer"
+ },
+ )
+ enable_quant: bool = field(
+ default=False,
+ metadata={"help": "Whether to enable quantization"},
)
@@ -410,8 +416,8 @@ def main():
)
# Replace with LightSeq encoder layers.
- if model_args.with_lightseq:
- inject_ls_enc_layer(model, training_args, config)
+ if model_args.module_type == 1 or model_args.module_type == 2:
+ inject_ls_layer(model, training_args, model_args, config)
# Preprocessing the datasets
if data_args.task_name is not None:
diff --git a/examples/training/huggingface/run_glue.sh b/examples/training/huggingface/bert/task_glue/run_glue.sh
similarity index 88%
rename from examples/training/huggingface/run_glue.sh
rename to examples/training/huggingface/bert/task_glue/run_glue.sh
index 84fa3c38..a7756ab2 100644
--- a/examples/training/huggingface/run_glue.sh
+++ b/examples/training/huggingface/bert/task_glue/run_glue.sh
@@ -15,22 +15,23 @@
THIS_DIR=$(dirname $(readlink -f $0))
-export TASK_NAME=stsb
+export TASK_NAME=sst2
python3 -m torch.distributed.launch \
--nproc_per_node=1 \
$THIS_DIR/run_glue.py \
- --model_name_or_path bert-large-cased \
+ --model_name_or_path bert-base-cased \
--task_name $TASK_NAME \
--do_train \
--do_eval \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-5 \
- --num_train_epochs 3 \
+ --num_train_epochs 10 \
--output_dir /tmp/$TASK_NAME/ \
--overwrite_output_dir \
- --with_lightseq true \
--fp16 \
--seed 1234 \
--logging_steps 10 \
+ --module_type 2 \
+ --enable_quant false
diff --git a/examples/training/huggingface/bert/task_glue/run_quant_glue.sh b/examples/training/huggingface/bert/task_glue/run_quant_glue.sh
new file mode 100644
index 00000000..d60e9233
--- /dev/null
+++ b/examples/training/huggingface/bert/task_glue/run_quant_glue.sh
@@ -0,0 +1,38 @@
+# Copyright 2021 The LightSeq Team
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THIS_DIR=$(dirname $(readlink -f $0))
+
+export TASK_NAME=sst2
+
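+# Quantization-aware fine-tuning: --resume_from_checkpoint below reuses the fp16 checkpoint that run_glue.sh saves under /tmp/$TASK_NAME/.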
+python3 -m torch.distributed.launch \
+ --nproc_per_node=1 \
+ $THIS_DIR/run_glue.py \
+ --model_name_or_path bert-base-cased \
+ --task_name $TASK_NAME \
+ --do_train \
+ --do_eval \
+ --max_seq_length 128 \
+ --per_device_train_batch_size 32 \
+ --learning_rate 2e-6 \
+ --num_train_epochs 20 \
+ --output_dir /tmp/quant/$TASK_NAME/ \
+ --overwrite_output_dir \
+ --resume_from_checkpoint /tmp/$TASK_NAME/ \
+ --fp16 \
+ --seed 1234 \
+ --logging_steps 10 \
+ --module_type 2 \
+ --enable_quant true
diff --git a/examples/training/huggingface/bert/task_ner/predict_quant_ner.sh b/examples/training/huggingface/bert/task_ner/predict_quant_ner.sh
new file mode 100644
index 00000000..df81783e
--- /dev/null
+++ b/examples/training/huggingface/bert/task_ner/predict_quant_ner.sh
@@ -0,0 +1,42 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
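+# Parse "-m <checkpoint>": the quantized NER checkpoint passed to --resume_from_checkpoint for prediction.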
+until [[ -z "$1" ]]
+do
+ case $1 in
+ -m)
+ shift; MODEL=$1;
+ shift;;
+ *)
+ shift;;
+ esac
+done
+
+THIS_DIR=$(dirname $(readlink -f $0))
+
+python3 -m torch.distributed.launch \
+ --nproc_per_node=1 \
+ $THIS_DIR/run_ner.py \
+ --model_name_or_path bert-base-uncased \
+ --dataset_name conll2003 \
+ --do_predict \
+ --per_device_train_batch_size 4 \
+ --output_dir /tmp/quant/test-ner \
+ --overwrite_output_dir \
+ --resume_from_checkpoint $MODEL \
+ --fp16 \
+ --seed 1234 \
+ --logging_steps 10 \
+ --module_type 2 \
+ --enable_quant true
diff --git a/examples/training/huggingface/run_ner.py b/examples/training/huggingface/bert/task_ner/run_ner.py
similarity index 96%
rename from examples/training/huggingface/run_ner.py
rename to examples/training/huggingface/bert/task_ner/run_ner.py
index 1f287bfd..41db6c1d 100644
--- a/examples/training/huggingface/run_ner.py
+++ b/examples/training/huggingface/bert/task_ner/run_ner.py
@@ -28,6 +28,7 @@
import numpy as np
from datasets import ClassLabel, load_dataset, load_metric
+import torch
import transformers
from transformers import (
@@ -43,7 +44,7 @@
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
-from ls_hf_transformer_encoder_layer import inject_ls_enc_layer
+from ls_hf_transformer_layer import inject_ls_layer
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -94,9 +95,15 @@ class ModelArguments:
"with private models)."
},
)
- with_lightseq: bool = field(
- default=True,
- metadata={"help": "Whether to use lightseq TransformerEncoder"},
+ module_type: int = field(
+ default=1,
+ metadata={
+ "help": "0: original Hugging Face layer, 1: LightSeq CUDA layer, 2: custom Torch layer"
+ },
+ )
+ enable_quant: bool = field(
+ default=False,
+ metadata={"help": "Whether to enable quantization"},
)
@@ -369,8 +376,8 @@ def get_label_list(labels):
)
# Replace with LightSeq encoder layers.
- if model_args.with_lightseq:
- inject_ls_enc_layer(model, training_args, config)
+ if model_args.module_type == 1 or model_args.module_type == 2:
+ inject_ls_layer(model, training_args, model_args, config)
# Tokenizer check: this script requires a fast tokenizer.
if not isinstance(tokenizer, PreTrainedTokenizerFast):
@@ -513,6 +520,12 @@ def compute_metrics(p):
compute_metrics=compute_metrics,
)
+ if not training_args.do_train:
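+        # Predict-only runs never call trainer.train(), so load the fine-tuned weights from --resume_from_checkpoint by hand.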
+ state_dict = torch.load(
+ training_args.resume_from_checkpoint, map_location="cpu"
+ )
+ trainer._load_state_dict_in_model(state_dict)
+
# Training
if training_args.do_train:
checkpoint = None
diff --git a/examples/training/huggingface/run_ner.sh b/examples/training/huggingface/bert/task_ner/run_ner.sh
similarity index 82%
rename from examples/training/huggingface/run_ner.sh
rename to examples/training/huggingface/bert/task_ner/run_ner.sh
index e37695d1..2664fdbb 100644
--- a/examples/training/huggingface/run_ner.sh
+++ b/examples/training/huggingface/bert/task_ner/run_ner.sh
@@ -14,19 +14,19 @@
THIS_DIR=$(dirname $(readlink -f $0))
-if [ -d "/tmp/test-ner/" ]; then
- rm -rf /tmp/test-ner/
-fi
-
python3 -m torch.distributed.launch \
--nproc_per_node=1 \
$THIS_DIR/run_ner.py \
- --model_name_or_path bert-large-uncased \
- --per_device_train_batch_size 16 \
+ --model_name_or_path bert-base-uncased \
--dataset_name conll2003 \
- --output_dir /tmp/test-ner \
--do_train \
--do_eval \
- --num_train_epochs 1 \
- --with_lightseq true \
+ --per_device_train_batch_size 16 \
+ --num_train_epochs 10 \
+ --output_dir /tmp/test-ner \
+ --overwrite_output_dir \
--fp16 \
+ --seed 1234 \
+ --logging_steps 10 \
+ --module_type 2 \
+ --enable_quant false
diff --git a/examples/training/huggingface/run_ner_no_trainer.sh b/examples/training/huggingface/bert/task_ner/run_quant_ner.sh
similarity index 61%
rename from examples/training/huggingface/run_ner_no_trainer.sh
rename to examples/training/huggingface/bert/task_ner/run_quant_ner.sh
index 278aa9cc..3d962e66 100644
--- a/examples/training/huggingface/run_ner_no_trainer.sh
+++ b/examples/training/huggingface/bert/task_ner/run_quant_ner.sh
@@ -14,13 +14,20 @@
THIS_DIR=$(dirname $(readlink -f $0))
-if [ -d "/tmp/test-ner/" ]; then
- rm -rf /tmp/test-ner/
-fi
-
-accelerate launch $THIS_DIR/run_ner_no_trainer.py \
- --model_name_or_path bert-large-uncased \
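+# Quantization-aware fine-tuning for NER: resumes from the fp16 checkpoint that run_ner.sh saves under /tmp/test-ner/.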
+python3 -m torch.distributed.launch \
+ --nproc_per_node=1 \
+ $THIS_DIR/run_ner.py \
+ --model_name_or_path bert-base-uncased \
--dataset_name conll2003 \
- --output_dir /tmp/test-ner \
- --task_name ner \
- --num_train_epochs 1
+ --do_train \
+ --do_eval \
+ --per_device_train_batch_size 16 \
+ --num_train_epochs 20 \
+ --output_dir /tmp/quant/test-ner \
+ --overwrite_output_dir \
+ --resume_from_checkpoint /tmp/test-ner/ \
+ --fp16 \
+ --seed 1234 \
+ --logging_steps 10 \
+ --module_type 2 \
+ --enable_quant true
diff --git a/examples/training/huggingface/bert/task_qa/run_qa.py b/examples/training/huggingface/bert/task_qa/run_qa.py
new file mode 100644
index 00000000..83c4fe02
--- /dev/null
+++ b/examples/training/huggingface/bert/task_qa/run_qa.py
@@ -0,0 +1,764 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The LightSeq Team
+# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for question answering using a slightly adapted version of the 🤗 Trainer.
+"""
+# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
+
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+import datasets
+from datasets import load_dataset, load_metric
+
+import transformers
+from trainer_qa import QuestionAnsweringTrainer
+from transformers import (
+ AutoConfig,
+ AutoModelForQuestionAnswering,
+ AutoTokenizer,
+ DataCollatorWithPadding,
+ EvalPrediction,
+ HfArgumentParser,
+ PreTrainedTokenizerFast,
+ TrainingArguments,
+ default_data_collator,
+ set_seed,
+)
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
+from utils_qa import postprocess_qa_predictions
+from ls_hf_transformer_layer import inject_ls_layer
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.17.0")
+
+require_version(
+ "datasets>=1.8.0",
+ "To fix: pip install -r examples/pytorch/question-answering/requirements.txt",
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+ """
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+ """
+
+ model_name_or_path: str = field(
+ metadata={
+ "help": "Path to pretrained model or model identifier from huggingface.co/models"
+ }
+ )
+ config_name: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "Pretrained config name or path if not the same as model_name"
+ },
+ )
+ tokenizer_name: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "Pretrained tokenizer name or path if not the same as model_name"
+ },
+ )
+ cache_dir: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "Path to directory to store the pretrained models downloaded from huggingface.co"
+ },
+ )
+ model_revision: str = field(
+ default="main",
+ metadata={
+ "help": "The specific model version to use (can be a branch name, tag name or commit id)."
+ },
+ )
+ use_auth_token: bool = field(
+ default=False,
+ metadata={
+ "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+ "with private models)."
+ },
+ )
+ module_type: int = field(
+ default=1,
+ metadata={
+ "help": "0: original Hugging Face layer, 1: LightSeq CUDA layer, 2: custom Torch layer"
+ },
+ )
+ enable_quant: bool = field(
+ default=False,
+ metadata={"help": "Whether to enable quantization"},
+ )
+
+
+@dataclass
+class DataTrainingArguments:
+ """
+ Arguments pertaining to what data we are going to input our model for training and eval.
+ """
+
+ dataset_name: Optional[str] = field(
+ default=None,
+ metadata={"help": "The name of the dataset to use (via the datasets library)."},
+ )
+ dataset_config_name: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "The configuration name of the dataset to use (via the datasets library)."
+ },
+ )
+ train_file: Optional[str] = field(
+ default=None, metadata={"help": "The input training data file (a text file)."}
+ )
+ validation_file: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."
+ },
+ )
+ test_file: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "An optional input test data file to evaluate the perplexity on (a text file)."
+ },
+ )
+ overwrite_cache: bool = field(
+ default=False,
+ metadata={"help": "Overwrite the cached training and evaluation sets"},
+ )
+ preprocessing_num_workers: Optional[int] = field(
+ default=None,
+ metadata={"help": "The number of processes to use for the preprocessing."},
+ )
+ max_seq_length: int = field(
+ default=384,
+ metadata={
+ "help": "The maximum total input sequence length after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded."
+ },
+ )
+ pad_to_max_length: bool = field(
+ default=True,
+ metadata={
+ "help": "Whether to pad all samples to `max_seq_length`. "
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can "
+ "be faster on GPU but will be slower on TPU)."
+ },
+ )
+ max_train_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+ "value if set."
+ },
+ )
+ max_eval_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+ "value if set."
+ },
+ )
+ max_predict_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+ "value if set."
+ },
+ )
+ version_2_with_negative: bool = field(
+ default=False,
+ metadata={"help": "If true, some of the examples do not have an answer."},
+ )
+ null_score_diff_threshold: float = field(
+ default=0.0,
+ metadata={
+ "help": "The threshold used to select the null answer: if the best answer has a score that is less than "
+ "the score of the null answer minus this threshold, the null answer is selected for this example. "
+ "Only useful when `version_2_with_negative=True`."
+ },
+ )
+ doc_stride: int = field(
+ default=128,
+ metadata={
+ "help": "When splitting up a long document into chunks, how much stride to take between chunks."
+ },
+ )
+ n_best_size: int = field(
+ default=20,
+ metadata={
+ "help": "The total number of n-best predictions to generate when looking for an answer."
+ },
+ )
+ max_answer_length: int = field(
+ default=30,
+ metadata={
+ "help": "The maximum length of an answer that can be generated. This is needed because the start "
+ "and end predictions are not conditioned on one another."
+ },
+ )
+
+ def __post_init__(self):
+ if (
+ self.dataset_name is None
+ and self.train_file is None
+ and self.validation_file is None
+ and self.test_file is None
+ ):
+ raise ValueError(
+ "Need either a dataset name or a training/validation file/test_file."
+ )
+ else:
+ if self.train_file is not None:
+ extension = self.train_file.split(".")[-1]
+ assert extension in [
+ "csv",
+ "json",
+ ], "`train_file` should be a csv or a json file."
+ if self.validation_file is not None:
+ extension = self.validation_file.split(".")[-1]
+ assert extension in [
+ "csv",
+ "json",
+ ], "`validation_file` should be a csv or a json file."
+ if self.test_file is not None:
+ extension = self.test_file.split(".")[-1]
+ assert extension in [
+ "csv",
+ "json",
+ ], "`test_file` should be a csv or a json file."
+
+
+def main():
+ # See all possible arguments in src/transformers/training_args.py
+ # or by passing the --help flag to this script.
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+ parser = HfArgumentParser(
+ (ModelArguments, DataTrainingArguments, TrainingArguments)
+ )
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+ # If we pass only one argument to the script and it's the path to a json file,
+ # let's parse it to get our arguments.
+ model_args, data_args, training_args = parser.parse_json_file(
+ json_file=os.path.abspath(sys.argv[1])
+ )
+ else:
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+ # Setup logging
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ handlers=[logging.StreamHandler(sys.stdout)],
+ )
+
+ log_level = training_args.get_process_log_level()
+ logger.setLevel(log_level)
+ datasets.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.enable_default_handler()
+ transformers.utils.logging.enable_explicit_format()
+
+ # Log on each process the small summary:
+ logger.warning(
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+ )
+ logger.info(f"Training/evaluation parameters {training_args}")
+
+ # Detecting last checkpoint.
+ last_checkpoint = None
+ if (
+ os.path.isdir(training_args.output_dir)
+ and training_args.do_train
+ and not training_args.overwrite_output_dir
+ ):
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+ raise ValueError(
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif (
+ last_checkpoint is not None and training_args.resume_from_checkpoint is None
+ ):
+ logger.info(
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+ )
+
+ # Set seed before initializing model.
+ set_seed(training_args.seed)
+
+ # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+ # (the dataset will be downloaded automatically from the datasets Hub).
+ #
+ # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+ # 'text' is found. You can easily tweak this behavior (see below).
+ #
+ # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+ # download the dataset.
+ if data_args.dataset_name is not None:
+ # Downloading and loading a dataset from the hub.
+ raw_datasets = load_dataset(
+ data_args.dataset_name,
+ data_args.dataset_config_name,
+ cache_dir=model_args.cache_dir,
+ )
+ else:
+ data_files = {}
+ if data_args.train_file is not None:
+ data_files["train"] = data_args.train_file
+ extension = data_args.train_file.split(".")[-1]
+
+ if data_args.validation_file is not None:
+ data_files["validation"] = data_args.validation_file
+ extension = data_args.validation_file.split(".")[-1]
+ if data_args.test_file is not None:
+ data_files["test"] = data_args.test_file
+ extension = data_args.test_file.split(".")[-1]
+ raw_datasets = load_dataset(
+ extension,
+ data_files=data_files,
+ field="data",
+ cache_dir=model_args.cache_dir,
+ )
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+ # Load pretrained model and tokenizer
+ #
+ # Distributed training:
+ # The .from_pretrained methods guarantee that only one local process can concurrently
+ # download model & vocab.
+ config = AutoConfig.from_pretrained(
+ model_args.config_name
+ if model_args.config_name
+ else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ tokenizer = AutoTokenizer.from_pretrained(
+ model_args.tokenizer_name
+ if model_args.tokenizer_name
+ else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ use_fast=True,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ model = AutoModelForQuestionAnswering.from_pretrained(
+ model_args.model_name_or_path,
+ from_tf=bool(".ckpt" in model_args.model_name_or_path),
+ config=config,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+
+ # Replace with LightSeq encoder layers.
+ if model_args.module_type == 1 or model_args.module_type == 2:
+ inject_ls_layer(model, training_args, model_args, config)
+
+ # Tokenizer check: this script requires a fast tokenizer.
+ if not isinstance(tokenizer, PreTrainedTokenizerFast):
+ raise ValueError(
+ "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
+ "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this "
+ "requirement"
+ )
+
+ # Preprocessing the datasets.
+    # Preprocessing is slightly different for training and evaluation.
+ if training_args.do_train:
+ column_names = raw_datasets["train"].column_names
+ elif training_args.do_eval:
+ column_names = raw_datasets["validation"].column_names
+ else:
+ column_names = raw_datasets["test"].column_names
+ question_column_name = "question" if "question" in column_names else column_names[0]
+ context_column_name = "context" if "context" in column_names else column_names[1]
+ answer_column_name = "answers" if "answers" in column_names else column_names[2]
+
+ # Padding side determines if we do (question|context) or (context|question).
+ pad_on_right = tokenizer.padding_side == "right"
+
+ if data_args.max_seq_length > tokenizer.model_max_length:
+ logger.warning(
+ f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+ f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+ )
+ max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+ # Training preprocessing
+ def prepare_train_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
+        # left whitespace.
+ examples[question_column_name] = [
+ q.lstrip() for q in examples[question_column_name]
+ ]
+
+ # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+        # in one example possibly giving several features when a context is long, each of those features having a
+        # context that overlaps a bit with the context of the previous feature.
+ tokenized_examples = tokenizer(
+ examples[question_column_name if pad_on_right else context_column_name],
+ examples[context_column_name if pad_on_right else question_column_name],
+ truncation="only_second" if pad_on_right else "only_first",
+ max_length=max_seq_length,
+ stride=data_args.doc_stride,
+ return_overflowing_tokens=True,
+ return_offsets_mapping=True,
+ padding="max_length" if data_args.pad_to_max_length else False,
+ )
+
+ # Since one example might give us several features if it has a long context, we need a map from a feature to
+ # its corresponding example. This key gives us just that.
+ sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+ # The offset mappings will give us a map from token to character position in the original context. This will
+ # help us compute the start_positions and end_positions.
+ offset_mapping = tokenized_examples.pop("offset_mapping")
+
+ # Let's label those examples!
+ tokenized_examples["start_positions"] = []
+ tokenized_examples["end_positions"] = []
+
+ for i, offsets in enumerate(offset_mapping):
+ # We will label impossible answers with the index of the CLS token.
+ input_ids = tokenized_examples["input_ids"][i]
+ cls_index = input_ids.index(tokenizer.cls_token_id)
+
+ # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+ sequence_ids = tokenized_examples.sequence_ids(i)
+
+ # One example can give several spans, this is the index of the example containing this span of text.
+ sample_index = sample_mapping[i]
+ answers = examples[answer_column_name][sample_index]
+ # If no answers are given, set the cls_index as answer.
+ if len(answers["answer_start"]) == 0:
+ tokenized_examples["start_positions"].append(cls_index)
+ tokenized_examples["end_positions"].append(cls_index)
+ else:
+ # Start/end character index of the answer in the text.
+ start_char = answers["answer_start"][0]
+ end_char = start_char + len(answers["text"][0])
+
+ # Start token index of the current span in the text.
+ token_start_index = 0
+ while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
+ token_start_index += 1
+
+ # End token index of the current span in the text.
+ token_end_index = len(input_ids) - 1
+ while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
+ token_end_index -= 1
+
+ # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
+ if not (
+ offsets[token_start_index][0] <= start_char
+ and offsets[token_end_index][1] >= end_char
+ ):
+ tokenized_examples["start_positions"].append(cls_index)
+ tokenized_examples["end_positions"].append(cls_index)
+ else:
+ # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
+ # Note: we could go after the last offset if the answer is the last word (edge case).
+ while (
+ token_start_index < len(offsets)
+ and offsets[token_start_index][0] <= start_char
+ ):
+ token_start_index += 1
+ tokenized_examples["start_positions"].append(token_start_index - 1)
+ while offsets[token_end_index][1] >= end_char:
+ token_end_index -= 1
+ tokenized_examples["end_positions"].append(token_end_index + 1)
+
+ return tokenized_examples
+
+ if training_args.do_train:
+ if "train" not in raw_datasets:
+ raise ValueError("--do_train requires a train dataset")
+ train_dataset = raw_datasets["train"]
+ if data_args.max_train_samples is not None:
+            # We will select samples from the whole data if the argument is specified
+ train_dataset = train_dataset.select(range(data_args.max_train_samples))
+ # Create train feature from dataset
+ with training_args.main_process_first(desc="train dataset map pre-processing"):
+ train_dataset = train_dataset.map(
+ prepare_train_features,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on train dataset",
+ )
+ if data_args.max_train_samples is not None:
+            # The number of samples might increase during feature creation, so we select only the specified max samples
+ train_dataset = train_dataset.select(range(data_args.max_train_samples))
+
+ # Validation preprocessing
+ def prepare_validation_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
+        # left whitespace.
+ examples[question_column_name] = [
+ q.lstrip() for q in examples[question_column_name]
+ ]
+
+ # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+        # in one example possibly giving several features when a context is long, each of those features having a
+        # context that overlaps a bit with the context of the previous feature.
+ tokenized_examples = tokenizer(
+ examples[question_column_name if pad_on_right else context_column_name],
+ examples[context_column_name if pad_on_right else question_column_name],
+ truncation="only_second" if pad_on_right else "only_first",
+ max_length=max_seq_length,
+ stride=data_args.doc_stride,
+ return_overflowing_tokens=True,
+ return_offsets_mapping=True,
+ padding="max_length" if data_args.pad_to_max_length else False,
+ )
+
+ # Since one example might give us several features if it has a long context, we need a map from a feature to
+ # its corresponding example. This key gives us just that.
+ sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
+ # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
+ # corresponding example_id and we will store the offset mappings.
+ tokenized_examples["example_id"] = []
+
+ for i in range(len(tokenized_examples["input_ids"])):
+ # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+ sequence_ids = tokenized_examples.sequence_ids(i)
+ context_index = 1 if pad_on_right else 0
+
+ # One example can give several spans, this is the index of the example containing this span of text.
+ sample_index = sample_mapping[i]
+ tokenized_examples["example_id"].append(examples["id"][sample_index])
+
+ # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
+ # position is part of the context or not.
+ tokenized_examples["offset_mapping"][i] = [
+ (o if sequence_ids[k] == context_index else None)
+ for k, o in enumerate(tokenized_examples["offset_mapping"][i])
+ ]
+
+ return tokenized_examples
+
+ if training_args.do_eval:
+ if "validation" not in raw_datasets:
+ raise ValueError("--do_eval requires a validation dataset")
+ eval_examples = raw_datasets["validation"]
+ if data_args.max_eval_samples is not None:
+            # We will select samples from the whole data
+ eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+ # Validation Feature Creation
+ with training_args.main_process_first(
+ desc="validation dataset map pre-processing"
+ ):
+ eval_dataset = eval_examples.map(
+ prepare_validation_features,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on validation dataset",
+ )
+ if data_args.max_eval_samples is not None:
+            # During feature creation the number of samples might increase, so we select the required samples again
+ eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+
+ if training_args.do_predict:
+ if "test" not in raw_datasets:
+ raise ValueError("--do_predict requires a test dataset")
+ predict_examples = raw_datasets["test"]
+ if data_args.max_predict_samples is not None:
+            # We will select samples from the whole data
+ predict_examples = predict_examples.select(
+ range(data_args.max_predict_samples)
+ )
+ # Predict Feature Creation
+ with training_args.main_process_first(
+ desc="prediction dataset map pre-processing"
+ ):
+ predict_dataset = predict_examples.map(
+ prepare_validation_features,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on prediction dataset",
+ )
+ if data_args.max_predict_samples is not None:
+            # During feature creation the number of samples might increase, so we select the required samples again
+ predict_dataset = predict_dataset.select(
+ range(data_args.max_predict_samples)
+ )
+
+ # Data collator
+ # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
+ # collator.
+ data_collator = (
+ default_data_collator
+ if data_args.pad_to_max_length
+ else DataCollatorWithPadding(
+ tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None
+ )
+ )
+
+ # Post-processing:
+ def post_processing_function(examples, features, predictions, stage="eval"):
+ # Post-processing: we match the start logits and end logits to answers in the original context.
+ predictions = postprocess_qa_predictions(
+ examples=examples,
+ features=features,
+ predictions=predictions,
+ version_2_with_negative=data_args.version_2_with_negative,
+ n_best_size=data_args.n_best_size,
+ max_answer_length=data_args.max_answer_length,
+ null_score_diff_threshold=data_args.null_score_diff_threshold,
+ output_dir=training_args.output_dir,
+ log_level=log_level,
+ prefix=stage,
+ )
+ # Format the result to the format the metric expects.
+ if data_args.version_2_with_negative:
+ formatted_predictions = [
+ {"id": k, "prediction_text": v, "no_answer_probability": 0.0}
+ for k, v in predictions.items()
+ ]
+ else:
+ formatted_predictions = [
+ {"id": k, "prediction_text": v} for k, v in predictions.items()
+ ]
+
+ references = [
+ {"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples
+ ]
+ return EvalPrediction(predictions=formatted_predictions, label_ids=references)
+
+ metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
+
+ def compute_metrics(p: EvalPrediction):
+ return metric.compute(predictions=p.predictions, references=p.label_ids)
+
+ # Initialize our Trainer
+ trainer = QuestionAnsweringTrainer(
+ model=model,
+ args=training_args,
+ train_dataset=train_dataset if training_args.do_train else None,
+ eval_dataset=eval_dataset if training_args.do_eval else None,
+ eval_examples=eval_examples if training_args.do_eval else None,
+ tokenizer=tokenizer,
+ data_collator=data_collator,
+ post_process_function=post_processing_function,
+ compute_metrics=compute_metrics,
+ )
+
+ # Training
+ if training_args.do_train:
+ checkpoint = None
+ if training_args.resume_from_checkpoint is not None:
+ checkpoint = training_args.resume_from_checkpoint
+ elif last_checkpoint is not None:
+ checkpoint = last_checkpoint
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
+ trainer.save_model() # Saves the tokenizer too for easy upload
+
+ metrics = train_result.metrics
+ max_train_samples = (
+ data_args.max_train_samples
+ if data_args.max_train_samples is not None
+ else len(train_dataset)
+ )
+ metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+ trainer.log_metrics("train", metrics)
+ trainer.save_metrics("train", metrics)
+ trainer.save_state()
+
+ # Evaluation
+ if training_args.do_eval:
+ logger.info("*** Evaluate ***")
+ metrics = trainer.evaluate()
+
+ max_eval_samples = (
+ data_args.max_eval_samples
+ if data_args.max_eval_samples is not None
+ else len(eval_dataset)
+ )
+ metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+ trainer.log_metrics("eval", metrics)
+ trainer.save_metrics("eval", metrics)
+
+ # Prediction
+ if training_args.do_predict:
+ logger.info("*** Predict ***")
+ results = trainer.predict(predict_dataset, predict_examples)
+ metrics = results.metrics
+
+ max_predict_samples = (
+ data_args.max_predict_samples
+ if data_args.max_predict_samples is not None
+ else len(predict_dataset)
+ )
+ metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
+
+ trainer.log_metrics("predict", metrics)
+ trainer.save_metrics("predict", metrics)
+
+ kwargs = {
+ "finetuned_from": model_args.model_name_or_path,
+ "tasks": "question-answering",
+ }
+ if data_args.dataset_name is not None:
+ kwargs["dataset_tags"] = data_args.dataset_name
+ if data_args.dataset_config_name is not None:
+ kwargs["dataset_args"] = data_args.dataset_config_name
+ kwargs[
+ "dataset"
+ ] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+ else:
+ kwargs["dataset"] = data_args.dataset_name
+
+ if training_args.push_to_hub:
+ trainer.push_to_hub(**kwargs)
+ else:
+ trainer.create_model_card(**kwargs)
+
+
+def _mp_fn(index):
+ # For xla_spawn (TPUs)
+ main()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/training/huggingface/bert/task_qa/run_qa.sh b/examples/training/huggingface/bert/task_qa/run_qa.sh
new file mode 100644
index 00000000..61346d8d
--- /dev/null
+++ b/examples/training/huggingface/bert/task_qa/run_qa.sh
@@ -0,0 +1,35 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THIS_DIR=$(dirname $(readlink -f $0))
+
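+# Fine-tune bert-base-uncased on SQuAD with the LightSeq CUDA encoder layer (module_type 1); quantization disabled.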
+python3 -m torch.distributed.launch \
+ --nproc_per_node=1 \
+ $THIS_DIR/run_qa.py \
+ --model_name_or_path bert-base-uncased \
+ --dataset_name squad \
+ --do_train \
+ --do_eval \
+ --max_seq_length 256 \
+ --per_device_train_batch_size 16 \
+ --doc_stride 128 \
+ --learning_rate 3e-5 \
+ --num_train_epochs 10 \
+ --output_dir /tmp/squad \
+ --overwrite_output_dir \
+ --fp16 \
+ --seed 1234 \
+ --logging_steps 10 \
+ --module_type 1 \
+ --enable_quant false
diff --git a/examples/training/huggingface/bert/task_qa/trainer_qa.py b/examples/training/huggingface/bert/task_qa/trainer_qa.py
new file mode 100644
index 00000000..c3c2ba01
--- /dev/null
+++ b/examples/training/huggingface/bert/task_qa/trainer_qa.py
@@ -0,0 +1,135 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A subclass of `Trainer` specific to Question-Answering tasks
+"""
+
+from transformers import Trainer, is_torch_tpu_available
+from transformers.trainer_utils import PredictionOutput
+
+
+if is_torch_tpu_available():
+ import torch_xla.core.xla_model as xm
+ import torch_xla.debug.metrics as met
+
+
+class QuestionAnsweringTrainer(Trainer):
+ def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.eval_examples = eval_examples
+ self.post_process_function = post_process_function
+
+ def evaluate(
+ self,
+ eval_dataset=None,
+ eval_examples=None,
+ ignore_keys=None,
+ metric_key_prefix: str = "eval",
+ ):
+ eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
+ eval_dataloader = self.get_eval_dataloader(eval_dataset)
+ eval_examples = self.eval_examples if eval_examples is None else eval_examples
+
+ # Temporarily disable metric computation, we will do it in the loop here.
+ compute_metrics = self.compute_metrics
+ self.compute_metrics = None
+ eval_loop = (
+ self.prediction_loop
+ if self.args.use_legacy_prediction_loop
+ else self.evaluation_loop
+ )
+ try:
+ output = eval_loop(
+ eval_dataloader,
+ description="Evaluation",
+ # No point gathering the predictions if there are no metrics, otherwise we defer to
+ # self.args.prediction_loss_only
+ prediction_loss_only=True if compute_metrics is None else None,
+ ignore_keys=ignore_keys,
+ )
+ finally:
+ self.compute_metrics = compute_metrics
+
+ if self.post_process_function is not None and self.compute_metrics is not None:
+ eval_preds = self.post_process_function(
+ eval_examples, eval_dataset, output.predictions
+ )
+ metrics = self.compute_metrics(eval_preds)
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ self.log(metrics)
+ else:
+ metrics = {}
+
+ if self.args.tpu_metrics_debug or self.args.debug:
+ # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
+ xm.master_print(met.metrics_report())
+
+ self.control = self.callback_handler.on_evaluate(
+ self.args, self.state, self.control, metrics
+ )
+ return metrics
+
+ def predict(
+ self,
+ predict_dataset,
+ predict_examples,
+ ignore_keys=None,
+ metric_key_prefix: str = "test",
+ ):
+ predict_dataloader = self.get_test_dataloader(predict_dataset)
+
+ # Temporarily disable metric computation, we will do it in the loop here.
+ compute_metrics = self.compute_metrics
+ self.compute_metrics = None
+ eval_loop = (
+ self.prediction_loop
+ if self.args.use_legacy_prediction_loop
+ else self.evaluation_loop
+ )
+ try:
+ output = eval_loop(
+ predict_dataloader,
+ description="Prediction",
+ # No point gathering the predictions if there are no metrics, otherwise we defer to
+ # self.args.prediction_loss_only
+ prediction_loss_only=True if compute_metrics is None else None,
+ ignore_keys=ignore_keys,
+ )
+ finally:
+ self.compute_metrics = compute_metrics
+
+ if self.post_process_function is None or self.compute_metrics is None:
+ return output
+
+ predictions = self.post_process_function(
+ predict_examples, predict_dataset, output.predictions, "predict"
+ )
+ metrics = self.compute_metrics(predictions)
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ return PredictionOutput(
+ predictions=predictions.predictions,
+ label_ids=predictions.label_ids,
+ metrics=metrics,
+ )
diff --git a/examples/training/huggingface/bert/task_qa/utils_qa.py b/examples/training/huggingface/bert/task_qa/utils_qa.py
new file mode 100644
index 00000000..c1c5c10b
--- /dev/null
+++ b/examples/training/huggingface/bert/task_qa/utils_qa.py
@@ -0,0 +1,520 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Post-processing utilities for question answering.
+"""
+import collections
+import json
+import logging
+import os
+from typing import Optional, Tuple
+
+import numpy as np
+from tqdm.auto import tqdm
+
+
+logger = logging.getLogger(__name__)
+
+
+def postprocess_qa_predictions(
+ examples,
+ features,
+ predictions: Tuple[np.ndarray, np.ndarray],
+ version_2_with_negative: bool = False,
+ n_best_size: int = 20,
+ max_answer_length: int = 30,
+ null_score_diff_threshold: float = 0.0,
+ output_dir: Optional[str] = None,
+ prefix: Optional[str] = None,
+ log_level: Optional[int] = logging.WARNING,
+):
+ """
+ Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
+    original contexts. This is the base postprocessing function for models that only return start and end logits.
+
+ Args:
+ examples: The non-preprocessed dataset (see the main script for more information).
+ features: The processed dataset (see the main script for more information).
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+ The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
+ first dimension must match the number of elements of :obj:`features`.
+ version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the underlying dataset contains examples with no answers.
+ n_best_size (:obj:`int`, `optional`, defaults to 20):
+ The total number of n-best predictions to generate when looking for an answer.
+ max_answer_length (:obj:`int`, `optional`, defaults to 30):
+ The maximum length of an answer that can be generated. This is needed because the start and end predictions
+ are not conditioned on one another.
+ null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
+ The threshold used to select the null answer: if the best answer has a score that is less than the score of
+ the null answer minus this threshold, the null answer is selected for this example (note that the score of
+ the null answer for an example giving several features is the minimum of the scores for the null answer on
+ each feature: all features must be aligned on the fact they `want` to predict a null answer).
+
+ Only useful when :obj:`version_2_with_negative` is :obj:`True`.
+ output_dir (:obj:`str`, `optional`):
+ If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
+ :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
+ answers, are saved in `output_dir`.
+ prefix (:obj:`str`, `optional`):
+ If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
+ log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+ ``logging`` log level (e.g., ``logging.WARNING``)
+ """
+ if len(predictions) != 2:
+ raise ValueError(
+ "`predictions` should be a tuple with two elements (start_logits, end_logits)."
+ )
+ all_start_logits, all_end_logits = predictions
+
+ if len(predictions[0]) != len(features):
+ raise ValueError(
+ f"Got {len(predictions[0])} predictions and {len(features)} features."
+ )
+
+ # Build a map example to its corresponding features.
+ example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+ features_per_example = collections.defaultdict(list)
+ for i, feature in enumerate(features):
+ features_per_example[example_id_to_index[feature["example_id"]]].append(i)
+
+ # The dictionaries we have to fill.
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ if version_2_with_negative:
+ scores_diff_json = collections.OrderedDict()
+
+ # Logging.
+ logger.setLevel(log_level)
+ logger.info(
+ f"Post-processing {len(examples)} example predictions split into {len(features)} features."
+ )
+
+ # Let's loop over all the examples!
+ for example_index, example in enumerate(tqdm(examples)):
+ # Those are the indices of the features associated to the current example.
+ feature_indices = features_per_example[example_index]
+
+ min_null_prediction = None
+ prelim_predictions = []
+
+ # Looping through all the features associated to the current example.
+ for feature_index in feature_indices:
+ # We grab the predictions of the model for this feature.
+ start_logits = all_start_logits[feature_index]
+ end_logits = all_end_logits[feature_index]
+            # This is what will allow us to map some of the positions in our logits to spans of text in the original
+ # context.
+ offset_mapping = features[feature_index]["offset_mapping"]
+ # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
+ # available in the current feature.
+ token_is_max_context = features[feature_index].get(
+ "token_is_max_context", None
+ )
+
+ # Update minimum null prediction.
+ feature_null_score = start_logits[0] + end_logits[0]
+ if (
+ min_null_prediction is None
+ or min_null_prediction["score"] > feature_null_score
+ ):
+ min_null_prediction = {
+ "offsets": (0, 0),
+ "score": feature_null_score,
+ "start_logit": start_logits[0],
+ "end_logit": end_logits[0],
+ }
+
+ # Go through all possibilities for the `n_best_size` greater start and end logits.
+ start_indexes = np.argsort(start_logits)[
+ -1 : -n_best_size - 1 : -1
+ ].tolist()
+ end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
+ for start_index in start_indexes:
+ for end_index in end_indexes:
+ # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
+ # to part of the input_ids that are not in the context.
+ if (
+ start_index >= len(offset_mapping)
+ or end_index >= len(offset_mapping)
+ or offset_mapping[start_index] is None
+ or len(offset_mapping[start_index]) < 2
+ or offset_mapping[end_index] is None
+ or len(offset_mapping[end_index]) < 2
+ ):
+ continue
+ # Don't consider answers with a length that is either < 0 or > max_answer_length.
+ if (
+ end_index < start_index
+ or end_index - start_index + 1 > max_answer_length
+ ):
+ continue
+                    # Don't consider answers that don't have the maximum context available (if such information is
+ # provided).
+ if (
+ token_is_max_context is not None
+ and not token_is_max_context.get(str(start_index), False)
+ ):
+ continue
+
+ prelim_predictions.append(
+ {
+ "offsets": (
+ offset_mapping[start_index][0],
+ offset_mapping[end_index][1],
+ ),
+ "score": start_logits[start_index] + end_logits[end_index],
+ "start_logit": start_logits[start_index],
+ "end_logit": end_logits[end_index],
+ }
+ )
+ if version_2_with_negative:
+ # Add the minimum null prediction
+ prelim_predictions.append(min_null_prediction)
+ null_score = min_null_prediction["score"]
+
+ # Only keep the best `n_best_size` predictions.
+ predictions = sorted(
+ prelim_predictions, key=lambda x: x["score"], reverse=True
+ )[:n_best_size]
+
+ # Add back the minimum null prediction if it was removed because of its low score.
+ if version_2_with_negative and not any(
+ p["offsets"] == (0, 0) for p in predictions
+ ):
+ predictions.append(min_null_prediction)
+
+ # Use the offsets to gather the answer text in the original context.
+ context = example["context"]
+ for pred in predictions:
+ offsets = pred.pop("offsets")
+ pred["text"] = context[offsets[0] : offsets[1]]
+
+        # In the very rare edge case where we do not have a single non-null prediction, we create a fake prediction to avoid
+ # failure.
+ if len(predictions) == 0 or (
+ len(predictions) == 1 and predictions[0]["text"] == ""
+ ):
+ predictions.insert(
+ 0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}
+ )
+
+ # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+ # the LogSumExp trick).
+ scores = np.array([pred.pop("score") for pred in predictions])
+ exp_scores = np.exp(scores - np.max(scores))
+ probs = exp_scores / exp_scores.sum()
+
+ # Include the probabilities in our predictions.
+ for prob, pred in zip(probs, predictions):
+ pred["probability"] = prob
+
+ # Pick the best prediction. If the null answer is not possible, this is easy.
+ if not version_2_with_negative:
+ all_predictions[example["id"]] = predictions[0]["text"]
+ else:
+ # Otherwise we first need to find the best non-empty prediction.
+ i = 0
+ while predictions[i]["text"] == "":
+ i += 1
+ best_non_null_pred = predictions[i]
+
+ # Then we compare to the null prediction using the threshold.
+ score_diff = (
+ null_score
+ - best_non_null_pred["start_logit"]
+ - best_non_null_pred["end_logit"]
+ )
+ scores_diff_json[example["id"]] = float(
+ score_diff
+ ) # To be JSON-serializable.
+ if score_diff > null_score_diff_threshold:
+ all_predictions[example["id"]] = ""
+ else:
+ all_predictions[example["id"]] = best_non_null_pred["text"]
+
+ # Make `predictions` JSON-serializable by casting np.float back to float.
+ all_nbest_json[example["id"]] = [
+ {
+ k: (
+ float(v)
+ if isinstance(v, (np.float16, np.float32, np.float64))
+ else v
+ )
+ for k, v in pred.items()
+ }
+ for pred in predictions
+ ]
+
+ # If we have an output_dir, let's save all those dicts.
+ if output_dir is not None:
+ if not os.path.isdir(output_dir):
+ raise EnvironmentError(f"{output_dir} is not a directory.")
+
+ prediction_file = os.path.join(
+ output_dir,
+ "predictions.json" if prefix is None else f"{prefix}_predictions.json",
+ )
+ nbest_file = os.path.join(
+ output_dir,
+ "nbest_predictions.json"
+ if prefix is None
+ else f"{prefix}_nbest_predictions.json",
+ )
+ if version_2_with_negative:
+ null_odds_file = os.path.join(
+ output_dir,
+ "null_odds.json" if prefix is None else f"{prefix}_null_odds.json",
+ )
+
+ logger.info(f"Saving predictions to {prediction_file}.")
+ with open(prediction_file, "w") as writer:
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
+ logger.info(f"Saving nbest_preds to {nbest_file}.")
+ with open(nbest_file, "w") as writer:
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+ if version_2_with_negative:
+ logger.info(f"Saving null_odds to {null_odds_file}.")
+ with open(null_odds_file, "w") as writer:
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+ return all_predictions
+
+
+def postprocess_qa_predictions_with_beam_search(
+ examples,
+ features,
+ predictions: Tuple[np.ndarray, np.ndarray],
+ version_2_with_negative: bool = False,
+ n_best_size: int = 20,
+ max_answer_length: int = 30,
+ start_n_top: int = 5,
+ end_n_top: int = 5,
+ output_dir: Optional[str] = None,
+ prefix: Optional[str] = None,
+ log_level: Optional[int] = logging.WARNING,
+):
+ """
+ Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
+    original contexts. This is the postprocessing function for models that return start and end logits, indices, as well as
+ cls token predictions.
+
+ Args:
+ examples: The non-preprocessed dataset (see the main script for more information).
+ features: The processed dataset (see the main script for more information).
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+            The predictions of the model: five arrays containing the start top log probabilities, the start top
+            indices, the end top log probabilities, the end top indices and the cls logits. Their first dimension
+            must match the number of elements of :obj:`features`.
+ version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the underlying dataset contains examples with no answers.
+ n_best_size (:obj:`int`, `optional`, defaults to 20):
+ The total number of n-best predictions to generate when looking for an answer.
+ max_answer_length (:obj:`int`, `optional`, defaults to 30):
+ The maximum length of an answer that can be generated. This is needed because the start and end predictions
+ are not conditioned on one another.
+ start_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top start logits to keep when searching for the :obj:`n_best_size` predictions.
+        end_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top end logits to keep when searching for the :obj:`n_best_size` predictions.
+ output_dir (:obj:`str`, `optional`):
+ If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
+ :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
+ answers, are saved in `output_dir`.
+ prefix (:obj:`str`, `optional`):
+ If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
+ log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+ ``logging`` log level (e.g., ``logging.WARNING``)
+ """
+ if len(predictions) != 5:
+ raise ValueError("`predictions` should be a tuple with five elements.")
+ (
+ start_top_log_probs,
+ start_top_index,
+ end_top_log_probs,
+ end_top_index,
+ cls_logits,
+ ) = predictions
+
+ if len(predictions[0]) != len(features):
+ raise ValueError(
+ f"Got {len(predictions[0])} predictions and {len(features)} features."
+ )
+
+ # Build a map example to its corresponding features.
+ example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+ features_per_example = collections.defaultdict(list)
+ for i, feature in enumerate(features):
+ features_per_example[example_id_to_index[feature["example_id"]]].append(i)
+
+ # The dictionaries we have to fill.
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ scores_diff_json = collections.OrderedDict() if version_2_with_negative else None
+
+ # Logging.
+ logger.setLevel(log_level)
+ logger.info(
+ f"Post-processing {len(examples)} example predictions split into {len(features)} features."
+ )
+
+ # Let's loop over all the examples!
+ for example_index, example in enumerate(tqdm(examples)):
+ # Those are the indices of the features associated to the current example.
+ feature_indices = features_per_example[example_index]
+
+ min_null_score = None
+ prelim_predictions = []
+
+ # Looping through all the features associated to the current example.
+ for feature_index in feature_indices:
+ # We grab the predictions of the model for this feature.
+ start_log_prob = start_top_log_probs[feature_index]
+ start_indexes = start_top_index[feature_index]
+ end_log_prob = end_top_log_probs[feature_index]
+ end_indexes = end_top_index[feature_index]
+ feature_null_score = cls_logits[feature_index]
+            # This is what will allow us to map some of the positions in our logits to spans of text in the original
+            # context.
+ offset_mapping = features[feature_index]["offset_mapping"]
+ # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
+ # available in the current feature.
+ token_is_max_context = features[feature_index].get(
+ "token_is_max_context", None
+ )
+
+ # Update minimum null prediction
+ if min_null_score is None or feature_null_score < min_null_score:
+ min_null_score = feature_null_score
+
+            # Go through all combinations of the top `start_n_top` start logits and top `end_n_top` end logits.
+ for i in range(start_n_top):
+ for j in range(end_n_top):
+ start_index = int(start_indexes[i])
+ j_index = i * end_n_top + j
+ end_index = int(end_indexes[j_index])
+ # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the
+ # p_mask but let's not take any risk)
+ if (
+ start_index >= len(offset_mapping)
+ or end_index >= len(offset_mapping)
+ or offset_mapping[start_index] is None
+ or offset_mapping[end_index] is None
+ ):
+ continue
+                    # Don't consider answers with a negative length or a length greater than max_answer_length.
+ if (
+ end_index < start_index
+ or end_index - start_index + 1 > max_answer_length
+ ):
+ continue
+                    # Don't consider answers that don't have the maximum context available (if such information is
+ # provided).
+ if (
+ token_is_max_context is not None
+ and not token_is_max_context.get(str(start_index), False)
+ ):
+ continue
+ prelim_predictions.append(
+ {
+ "offsets": (
+ offset_mapping[start_index][0],
+ offset_mapping[end_index][1],
+ ),
+ "score": start_log_prob[i] + end_log_prob[j_index],
+ "start_log_prob": start_log_prob[i],
+ "end_log_prob": end_log_prob[j_index],
+ }
+ )
+
+ # Only keep the best `n_best_size` predictions.
+ predictions = sorted(
+ prelim_predictions, key=lambda x: x["score"], reverse=True
+ )[:n_best_size]
+
+ # Use the offsets to gather the answer text in the original context.
+ context = example["context"]
+ for pred in predictions:
+ offsets = pred.pop("offsets")
+ pred["text"] = context[offsets[0] : offsets[1]]
+
+ # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
+ # failure.
+ if len(predictions) == 0:
+ predictions.insert(
+ 0,
+ {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6},
+ )
+
+ # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+ # the LogSumExp trick).
+ scores = np.array([pred.pop("score") for pred in predictions])
+ exp_scores = np.exp(scores - np.max(scores))
+ probs = exp_scores / exp_scores.sum()
+
+ # Include the probabilities in our predictions.
+ for prob, pred in zip(probs, predictions):
+ pred["probability"] = prob
+
+ # Pick the best prediction and set the probability for the null answer.
+ all_predictions[example["id"]] = predictions[0]["text"]
+ if version_2_with_negative:
+ scores_diff_json[example["id"]] = float(min_null_score)
+
+ # Make `predictions` JSON-serializable by casting np.float back to float.
+ all_nbest_json[example["id"]] = [
+ {
+ k: (
+ float(v)
+ if isinstance(v, (np.float16, np.float32, np.float64))
+ else v
+ )
+ for k, v in pred.items()
+ }
+ for pred in predictions
+ ]
+
+ # If we have an output_dir, let's save all those dicts.
+ if output_dir is not None:
+ if not os.path.isdir(output_dir):
+ raise EnvironmentError(f"{output_dir} is not a directory.")
+
+ prediction_file = os.path.join(
+ output_dir,
+ "predictions.json" if prefix is None else f"{prefix}_predictions.json",
+ )
+ nbest_file = os.path.join(
+ output_dir,
+ "nbest_predictions.json"
+ if prefix is None
+ else f"{prefix}_nbest_predictions.json",
+ )
+ if version_2_with_negative:
+ null_odds_file = os.path.join(
+ output_dir,
+ "null_odds.json" if prefix is None else f"{prefix}_null_odds.json",
+ )
+
+ logger.info(f"Saving predictions to {prediction_file}.")
+ with open(prediction_file, "w") as writer:
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
+ logger.info(f"Saving nbest_preds to {nbest_file}.")
+ with open(nbest_file, "w") as writer:
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+ if version_2_with_negative:
+ logger.info(f"Saving null_odds to {null_odds_file}.")
+ with open(null_odds_file, "w") as writer:
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+ return all_predictions, scores_diff_json
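As a side note on the probability step inside the loop above (subtract the max score, exponentiate, then normalize), here is a small self-contained NumPy sketch of that numerically stable softmax; the helper name `stable_softmax` and the sample scores are purely illustrative:

```python
import numpy as np

def stable_softmax(scores: np.ndarray) -> np.ndarray:
    # Subtracting the max leaves the softmax unchanged but prevents exp() overflow.
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / exp_scores.sum()

scores = np.array([12.3, 11.7, 9.0, -4.2])
probs = stable_softmax(scores)
print(probs, probs.sum())  # per-candidate probabilities, summing to 1.0
```

This mirrors how the n-best candidate scores are turned into the `probability` field that ends up in `nbest_predictions.json`.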
diff --git a/examples/training/huggingface/gpt/README.md b/examples/training/huggingface/gpt/README.md
index 76bfc84a..fe80f415 100644
--- a/examples/training/huggingface/gpt/README.md
+++ b/examples/training/huggingface/gpt/README.md
@@ -7,8 +7,8 @@ We modify the language modeling [examples](https://github.com/huggingface/transf
First you should install these requirements.
```shell
-pip install -r requirements.txt
-bash run_clm.sh
+$ pip install -r requirements.txt
+$ bash run_clm.sh
```
Before running the script, make sure your PyTorch works fine with CUDA; LightSeq doesn't support PyTorch CPU mode. You can verify that PyTorch sees your CUDA device with the following code.
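The verification snippet itself is outside this diff excerpt; as a minimal sketch (assuming only a standard PyTorch install), a check like the following is enough:

```python
import torch

# LightSeq training requires a CUDA-enabled PyTorch build; fail fast otherwise.
assert torch.cuda.is_available(), "PyTorch cannot see a CUDA device"
print(torch.__version__, torch.version.cuda, torch.cuda.get_device_name(0))
```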
diff --git a/examples/training/huggingface/gpt/__init__.py b/examples/training/huggingface/gpt/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/training/huggingface/gpt/ls_hf_gpt_layer.py b/examples/training/huggingface/gpt/ls_hf_gpt_layer.py
new file mode 100644
index 00000000..90061766
--- /dev/null
+++ b/examples/training/huggingface/gpt/ls_hf_gpt_layer.py
@@ -0,0 +1,129 @@
+import torch
+from torch import nn
+
+from lightseq.training.ops.pytorch.quantization import (
+ qat_mode,
+ disable_quant,
+ QuantLinear,
+ TensorQuantizer,
+ weight_quant_config,
+)
+from lightseq.training.ops.pytorch.torch_transformer_layers import (
+ TransformerDecoderLayer,
+ copy_para,
+)
+
+
+def get_hf_gpt_enc_layer_params(layer, config):
+ init_ws = []
+ init_bs = []
+
+ init_ws.extend(
+ layer.attn.c_attn.weight.detach().clone().t().split(config.hidden_size, 0)
+ )
+ init_bs.extend(layer.attn.c_attn.bias.detach().clone().split(config.hidden_size, 0))
+
+ init_ws.append(layer.attn.c_proj.weight.detach().clone().t().reshape(-1))
+ init_bs.append(layer.attn.c_proj.bias.detach().clone())
+ init_ws.append(layer.ln_1.weight.detach().clone())
+ init_bs.append(layer.ln_1.bias.detach().clone())
+
+ init_ws.append(layer.mlp.c_fc.weight.detach().clone().t().reshape(-1))
+ init_bs.append(layer.mlp.c_fc.bias.detach().clone())
+ init_ws.append(layer.mlp.c_proj.weight.detach().clone().t().reshape(-1))
+ init_bs.append(layer.mlp.c_proj.bias.detach().clone())
+ init_ws.append(layer.ln_2.weight.detach().clone())
+ init_bs.append(layer.ln_2.bias.detach().clone())
+
+ return init_ws, init_bs
+
+
+def get_hf_gpt_emb_layer_params(layer):
+ init_ws = []
+
+ init_ws.append(layer.wte.weight.detach().clone())
+ init_ws.append(layer.wpe.weight.detach().clone())
+
+ return init_ws
+
+
+def gen_gpt_enc_config(training_args, config):
+ gpt_enc_config = TransformerDecoderLayer.get_config(
+ max_batch_tokens=8192,
+ max_seq_len=config.max_position_embeddings,
+ hidden_size=config.hidden_size,
+ intermediate_size=4 * config.hidden_size,
+ nhead=config.num_attention_heads,
+ attn_prob_dropout_ratio=config.attn_pdrop,
+ activation_dropout_ratio=config.resid_pdrop,
+ hidden_dropout_ratio=config.resid_pdrop,
+ pre_layer_norm=True,
+ fp16=training_args.fp16,
+ local_rank=training_args.local_rank,
+ nlayer=config.num_hidden_layers,
+ activation_fn="gelu",
+ has_cross_attn=False,
+ )
+ return gpt_enc_config
+
+
+class LSHFGptEncoderLayer(TransformerDecoderLayer):
+ def __init__(self, *args, **kwargs):
+ super(LSHFGptEncoderLayer, self).__init__(*args, **kwargs)
+
+    def forward(self, hidden_states, attention_mask=None, *args, **kwargs):
+        # Reduce the incoming attention mask to (batch_size, seq_len) for the LightSeq
+        # layer; if no mask is given, fall back to an all-zero mask (nothing is masked).
+        if attention_mask is not None:
+            ls_attention_mask = attention_mask.squeeze()
+        else:
+            ls_attention_mask = torch.zeros(hidden_states.size()[:2])
+ output = super().forward(hidden_states, ls_attention_mask)
+ return output
+
+
+class GptEmbedding(nn.Embedding):
+ def __init__(self, training_args, initial_embeddings=None, *args, **kwargs):
+ super(GptEmbedding, self).__init__(*args, **kwargs)
+ self.emb_quant = TensorQuantizer(weight_quant_config)
+
+ if initial_embeddings is not None:
+ self.weight.data.copy_(copy_para(initial_embeddings, training_args.fp16))
+
+ def forward(self, input_ids):
+ x = super(GptEmbedding, self).forward(input_ids)
+ x = self.emb_quant(x)
+ return x
+
+
+def inject_ls_layer(model, training_args, model_args, config):
+ if model_args.module_type == 1:
+ from lightseq.training import ls_hf_gpt_enc_convert
+
+ ls_hf_gpt_enc_convert(model, training_args, config)
+ return
+
+ if model_args.module_type != 2:
+ raise NotImplementedError
+
+ init_ws = get_hf_gpt_emb_layer_params(model.transformer)
+ model.transformer.wte = GptEmbedding(
+ training_args, init_ws[0], config.vocab_size, config.hidden_size
+ )
+ if model_args.enable_quant:
+ model.transformer.wte.apply(qat_mode)
+ else:
+ model.transformer.wte.apply(disable_quant)
+
+ for i in range(config.num_hidden_layers):
+ gpt_enc_config = gen_gpt_enc_config(training_args, config)
+ init_ws, init_bs = get_hf_gpt_enc_layer_params(model.transformer.h[i], config)
+ model.transformer.h[i] = LSHFGptEncoderLayer(
+ gpt_enc_config, init_ws, init_bs
+ ).cuda()
+ if model_args.enable_quant:
+ model.transformer.h[i].apply(qat_mode)
+ else:
+ model.transformer.h[i].apply(disable_quant)
+
+ q_lm_head = QuantLinear(config.n_embd, config.vocab_size, bias=False)
+ q_lm_head.weight = model.transformer.wte.weight
+ q_lm_head.weight_quant = model.transformer.wte.emb_quant
+ model.lm_head = q_lm_head
diff --git a/examples/training/huggingface/gpt/run_clm.py b/examples/training/huggingface/gpt/run_clm.py
index 90b9dd8d..52dfc223 100644
--- a/examples/training/huggingface/gpt/run_clm.py
+++ b/examples/training/huggingface/gpt/run_clm.py
@@ -33,6 +33,7 @@
import datasets
from datasets import load_dataset
+import torch
import transformers
from transformers import (
CONFIG_MAPPING,
@@ -50,8 +51,7 @@
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
-
-from lightseq.training import ls_hf_gpt_convert
+from ls_hf_gpt_layer import inject_ls_layer
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -133,9 +133,15 @@ class ModelArguments:
"with private models)."
},
)
- with_lightseq: bool = field(
- default=True,
- metadata={"help": "Whether to use lightseq"},
+ module_type: int = field(
+ default=1,
+ metadata={
+ "help": "0: original Hugging Face layer, 1: LightSeq CUDA layer, 2: custom Torch layer"
+ },
+ )
+ enable_quant: bool = field(
+ default=False,
+ metadata={"help": "Whether to enable quantization"},
)
def __post_init__(self):
@@ -436,8 +442,8 @@ def main():
)
# Replace with LightSeq encoder layers.
- if model_args.with_lightseq:
- ls_hf_gpt_convert(model, training_args, config)
+ if model_args.module_type == 1 or model_args.module_type == 2:
+ inject_ls_layer(model, training_args, model_args, config)
model.resize_token_embeddings(len(tokenizer))
@@ -548,6 +554,12 @@ def group_texts(examples):
data_collator=default_data_collator,
)
+ if not training_args.do_train:
+ state_dict = torch.load(
+ training_args.resume_from_checkpoint, map_location="cpu"
+ )
+ trainer._load_state_dict_in_model(state_dict)
+
# Training
if training_args.do_train:
checkpoint = None
diff --git a/examples/training/huggingface/gpt/run_clm.sh b/examples/training/huggingface/gpt/run_clm.sh
index 863a8b97..30449bc4 100644
--- a/examples/training/huggingface/gpt/run_clm.sh
+++ b/examples/training/huggingface/gpt/run_clm.sh
@@ -8,12 +8,15 @@ python3 -m torch.distributed.launch \
--model_name_or_path gpt2 \
--dataset_name wikitext \
--dataset_config_name wikitext-103-raw-v1 \
- --per_device_train_batch_size 8 \
+ --per_device_train_batch_size 16 \
--per_device_eval_batch_size 8 \
+ --num_train_epochs 1 \
--do_train \
--do_eval \
--output_dir /tmp/test-clm \
--overwrite_output_dir \
--fp16 \
--logging_steps 10 \
- --block_size 512
+ --block_size 512 \
+ --module_type 2 \
+ --enable_quant false
diff --git a/examples/training/huggingface/gpt/run_quant_clm.sh b/examples/training/huggingface/gpt/run_quant_clm.sh
new file mode 100644
index 00000000..196e6434
--- /dev/null
+++ b/examples/training/huggingface/gpt/run_quant_clm.sh
@@ -0,0 +1,23 @@
+#! /bin/bash
+
+THIS_DIR=$(dirname $(readlink -f $0))
+
+python3 -m torch.distributed.launch \
+ --nproc_per_node=1 \
+ $THIS_DIR/run_clm.py \
+ --model_name_or_path gpt2 \
+ --dataset_name wikitext \
+ --dataset_config_name wikitext-103-raw-v1 \
+ --per_device_train_batch_size 16 \
+ --per_device_eval_batch_size 8 \
+ --num_train_epochs 2 \
+ --do_train \
+ --do_eval \
+ --output_dir /tmp/quant/test-clm \
+ --overwrite_output_dir \
+ --resume_from_checkpoint /tmp/test-clm \
+ --fp16 \
+ --logging_steps 10 \
+ --block_size 512 \
+ --module_type 2 \
+ --enable_quant true
diff --git a/examples/training/huggingface/ls_hf_transformer_encoder_layer.py b/examples/training/huggingface/ls_hf_transformer_encoder_layer.py
deleted file mode 100644
index 38db61fe..00000000
--- a/examples/training/huggingface/ls_hf_transformer_encoder_layer.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import random
-
-from lightseq.training.ops.pytorch.transformer_encoder_layer import (
- LSTransformerEncoderLayer,
-)
-
-
-class LSHFTransformerEncoderLayer(LSTransformerEncoderLayer):
- def __init__(self, *args, **kwargs):
- super(LSHFTransformerEncoderLayer, self).__init__(*args, **kwargs)
-
- def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs):
- ls_encoder_padding_mask = encoder_padding_mask / -10000.0
- ls_encoder_padding_mask = ls_encoder_padding_mask.squeeze()
- output = super().forward(hidden_states, ls_encoder_padding_mask)
- return (output, None, None, None)
-
-
-def gen_bert_config(training_args, config):
- bert_config = LSTransformerEncoderLayer.get_config(
- max_batch_tokens=4096,
- max_seq_len=config.max_position_embeddings,
- hidden_size=config.hidden_size,
- intermediate_size=config.intermediate_size,
- nhead=config.num_attention_heads,
- attn_prob_dropout_ratio=config.attention_probs_dropout_prob,
- activation_dropout_ratio=config.hidden_dropout_prob,
- hidden_dropout_ratio=config.hidden_dropout_prob,
- pre_layer_norm=False,
- fp16=training_args.fp16,
- local_rank=training_args.local_rank,
- activation_fn="gelu",
- )
- return bert_config
-
-
-def get_hf_bert_enc_layer_params(layer):
- init_ws = []
- init_bs = []
-
- init_ws.append(layer.attention.self.query.weight.detach().clone())
- init_bs.append(layer.attention.self.query.bias.detach().clone())
- init_ws.append(layer.attention.self.key.weight.detach().clone())
- init_bs.append(layer.attention.self.key.bias.detach().clone())
- init_ws.append(layer.attention.self.value.weight.detach().clone())
- init_bs.append(layer.attention.self.value.bias.detach().clone())
- init_ws.append(layer.attention.output.dense.weight.detach().clone())
- init_bs.append(layer.attention.output.dense.bias.detach().clone())
- init_ws.append(layer.attention.output.LayerNorm.weight.detach().clone())
- init_bs.append(layer.attention.output.LayerNorm.bias.detach().clone())
-
- init_ws.append(layer.intermediate.dense.weight.detach().clone())
- init_bs.append(layer.intermediate.dense.bias.detach().clone())
- init_ws.append(layer.output.dense.weight.detach().clone())
- init_bs.append(layer.output.dense.bias.detach().clone())
- init_ws.append(layer.output.LayerNorm.weight.detach().clone())
- init_bs.append(layer.output.LayerNorm.bias.detach().clone())
-
- return init_ws, init_bs
-
-
-def inject_ls_enc_layer(model, training_args, config):
- for i in range(config.num_hidden_layers):
- bert_config = gen_bert_config(training_args, config)
- init_ws, init_bs = get_hf_bert_enc_layer_params(model.bert.encoder.layer[i])
- model.bert.encoder.layer[i] = LSHFTransformerEncoderLayer(
- bert_config, init_ws, init_bs
- ).cuda()
diff --git a/examples/training/huggingface/run_ner_no_trainer.py b/examples/training/huggingface/run_ner_no_trainer.py
deleted file mode 100644
index 88db653b..00000000
--- a/examples/training/huggingface/run_ner_no_trainer.py
+++ /dev/null
@@ -1,618 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning a 🤗 Transformers model on token classification tasks (NER, POS, CHUNKS) relying on the accelerate library
-without using a Trainer.
-"""
-
-import argparse
-import logging
-import math
-import os
-import random
-
-import datasets
-import torch
-from datasets import ClassLabel, load_dataset, load_metric
-from torch.utils.data.dataloader import DataLoader
-from tqdm.auto import tqdm
-
-import transformers
-from accelerate import Accelerator
-from transformers import (
- CONFIG_MAPPING,
- MODEL_MAPPING,
- AdamW,
- AutoConfig,
- AutoModelForTokenClassification,
- AutoTokenizer,
- DataCollatorForTokenClassification,
- SchedulerType,
- default_data_collator,
- get_scheduler,
- set_seed,
-)
-from ls_hf_transformer_encoder_layer import inject_ls_enc_layer
-
-logger = logging.getLogger(__name__)
-# You should update this to your particular problem to have better documentation of `model_type`
-MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-
-def parse_args():
- parser = argparse.ArgumentParser(
- description="Finetune a transformers model on a text classification task (NER) with accelerate library"
- )
- parser.add_argument(
- "--dataset_name",
- type=str,
- default=None,
- help="The name of the dataset to use (via the datasets library).",
- )
- parser.add_argument(
- "--dataset_config_name",
- type=str,
- default=None,
- help="The configuration name of the dataset to use (via the datasets library).",
- )
- parser.add_argument(
- "--train_file",
- type=str,
- default=None,
- help="A csv or a json file containing the training data.",
- )
- parser.add_argument(
- "--validation_file",
- type=str,
- default=None,
- help="A csv or a json file containing the validation data.",
- )
- parser.add_argument(
- "--max_length",
- type=int,
- default=128,
- help=(
- "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
- " sequences shorter will be padded if `--pad_to_max_lenght` is passed."
- ),
- )
- parser.add_argument(
- "--pad_to_max_length",
- action="store_true",
- help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
- )
- parser.add_argument(
- "--model_name_or_path",
- type=str,
- help="Path to pretrained model or model identifier from huggingface.co/models.",
- required=True,
- )
- parser.add_argument(
- "--config_name",
- type=str,
- default=None,
- help="Pretrained config name or path if not the same as model_name",
- )
- parser.add_argument(
- "--tokenizer_name",
- type=str,
- default=None,
- help="Pretrained tokenizer name or path if not the same as model_name",
- )
- parser.add_argument(
- "--per_device_train_batch_size",
- type=int,
- default=8,
- help="Batch size (per device) for the training dataloader.",
- )
- parser.add_argument(
- "--per_device_eval_batch_size",
- type=int,
- default=8,
- help="Batch size (per device) for the evaluation dataloader.",
- )
- parser.add_argument(
- "--learning_rate",
- type=float,
- default=5e-5,
- help="Initial learning rate (after the potential warmup period) to use.",
- )
- parser.add_argument(
- "--weight_decay", type=float, default=0.0, help="Weight decay to use."
- )
- parser.add_argument(
- "--num_train_epochs",
- type=int,
- default=3,
- help="Total number of training epochs to perform.",
- )
- parser.add_argument(
- "--max_train_steps",
- type=int,
- default=None,
- help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
- )
- parser.add_argument(
- "--gradient_accumulation_steps",
- type=int,
- default=1,
- help="Number of updates steps to accumulate before performing a backward/update pass.",
- )
- parser.add_argument(
- "--lr_scheduler_type",
- type=SchedulerType,
- default="linear",
- help="The scheduler type to use.",
- choices=[
- "linear",
- "cosine",
- "cosine_with_restarts",
- "polynomial",
- "constant",
- "constant_with_warmup",
- ],
- )
- parser.add_argument(
- "--num_warmup_steps",
- type=int,
- default=0,
- help="Number of steps for the warmup in the lr scheduler.",
- )
- parser.add_argument(
- "--output_dir", type=str, default=None, help="Where to store the final model."
- )
- parser.add_argument(
- "--seed", type=int, default=None, help="A seed for reproducible training."
- )
- parser.add_argument(
- "--model_type",
- type=str,
- default=None,
- help="Model type to use if training from scratch.",
- choices=MODEL_TYPES,
- )
- parser.add_argument(
- "--label_all_tokens",
- action="store_true",
- help="Setting labels of all special tokens to -100 and thus PyTorch will ignore them.",
- )
- parser.add_argument(
- "--return_entity_level_metrics",
- action="store_true",
- help="Indication whether entity level metrics are to be returner.",
- )
- parser.add_argument(
- "--task_name",
- type=str,
- default="ner",
- choices=["ner", "pos", "chunk"],
- help="The name of the task.",
- )
- parser.add_argument(
- "--debug",
- action="store_true",
- help="Activate debug mode and run training only with a subset of data.",
- )
- args = parser.parse_args()
-
- # Sanity checks
- if (
- args.task_name is None
- and args.train_file is None
- and args.validation_file is None
- ):
- raise ValueError("Need either a task name or a training/validation file.")
- else:
- if args.train_file is not None:
- extension = args.train_file.split(".")[-1]
- assert extension in [
- "csv",
- "json",
- ], "`train_file` should be a csv or a json file."
- if args.validation_file is not None:
- extension = args.validation_file.split(".")[-1]
- assert extension in [
- "csv",
- "json",
- ], "`validation_file` should be a csv or a json file."
-
- if args.output_dir is not None:
- os.makedirs(args.output_dir, exist_ok=True)
-
- return args
-
-
-def main():
- args = parse_args()
-
- # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
- accelerator = Accelerator()
- # Make one log on every process with the configuration for debugging.
- logging.basicConfig(
- format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
- datefmt="%m/%d/%Y %H:%M:%S",
- level=logging.INFO,
- )
- logger.info(accelerator.state)
-
- # Setup logging, we only want one process per machine to log things on the screen.
- # accelerator.is_local_main_process is only True for one process per machine.
- logger.setLevel(
- logging.INFO if accelerator.is_local_main_process else logging.ERROR
- )
- if accelerator.is_local_main_process:
- datasets.utils.logging.set_verbosity_warning()
- transformers.utils.logging.set_verbosity_info()
- else:
- datasets.utils.logging.set_verbosity_error()
- transformers.utils.logging.set_verbosity_error()
-
- # If passed along, set the training seed now.
- if args.seed is not None:
- set_seed(args.seed)
-
- # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
- # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/
- # (the dataset will be downloaded automatically from the datasets Hub).
- #
- # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
- # 'tokens' is found. You can easily tweak this behavior (see below).
- #
- # In distributed training, the load_dataset function guarantee that only one local process can concurrently
- # download the dataset.
- if args.dataset_name is not None:
- # Downloading and loading a dataset from the hub.
- raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
- else:
- data_files = {}
- if args.train_file is not None:
- data_files["train"] = args.train_file
- if args.validation_file is not None:
- data_files["validation"] = args.validation_file
- extension = args.train_file.split(".")[-1]
- raw_datasets = load_dataset(extension, data_files=data_files)
- # Trim a number of training examples
- if args.debug:
- for split in raw_datasets.keys():
- raw_datasets[split] = raw_datasets[split].select(range(100))
- # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
- # https://huggingface.co/docs/datasets/loading_datasets.html.
-
- if raw_datasets["train"] is not None:
- column_names = raw_datasets["train"].column_names
- features = raw_datasets["train"].features
- else:
- column_names = raw_datasets["validation"].column_names
- features = raw_datasets["validation"].features
- text_column_name = "tokens" if "tokens" in column_names else column_names[0]
- label_column_name = (
- f"{args.task_name}_tags"
- if f"{args.task_name}_tags" in column_names
- else column_names[1]
- )
-
- # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
- # unique labels.
- def get_label_list(labels):
- unique_labels = set()
- for label in labels:
- unique_labels = unique_labels | set(label)
- label_list = list(unique_labels)
- label_list.sort()
- return label_list
-
- if isinstance(features[label_column_name].feature, ClassLabel):
- label_list = features[label_column_name].feature.names
- # No need to convert the labels since they are already ints.
- label_to_id = {i: i for i in range(len(label_list))}
- else:
- label_list = get_label_list(raw_datasets["train"][label_column_name])
- label_to_id = {l: i for i, l in enumerate(label_list)}
- num_labels = len(label_list)
-
- # Load pretrained model and tokenizer
- #
- # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
- # download model & vocab.
- if args.config_name:
- config = AutoConfig.from_pretrained(args.config_name, num_labels=num_labels)
- elif args.model_name_or_path:
- config = AutoConfig.from_pretrained(
- args.model_name_or_path, num_labels=num_labels
- )
- else:
- config = CONFIG_MAPPING[args.model_type]()
- logger.warning("You are instantiating a new config instance from scratch.")
-
- if args.tokenizer_name:
- tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=True)
- elif args.model_name_or_path:
- tokenizer = AutoTokenizer.from_pretrained(
- args.model_name_or_path, use_fast=True
- )
- else:
- raise ValueError(
- "You are instantiating a new tokenizer from scratch. This is not supported by this script."
- "You can do it from another script, save it, and load it from here, using --tokenizer_name."
- )
-
- if args.model_name_or_path:
- model = AutoModelForTokenClassification.from_pretrained(
- args.model_name_or_path,
- from_tf=bool(".ckpt" in args.model_name_or_path),
- config=config,
- )
- else:
- logger.info("Training new model from scratch")
- model = AutoModelForTokenClassification.from_config(config)
-
- model.resize_token_embeddings(len(tokenizer))
-
- # Replace with LightSeq encoder layers.
- args.local_rank = accelerator.local_process_index
- args.fp16 = accelerator.use_fp16
- inject_ls_enc_layer(model, args, config)
-
- # Preprocessing the raw_datasets.
- # First we tokenize all the texts.
- padding = "max_length" if args.pad_to_max_length else False
-
- # Tokenize all texts and align the labels with them.
-
- def tokenize_and_align_labels(examples):
- tokenized_inputs = tokenizer(
- examples[text_column_name],
- max_length=args.max_length,
- padding=padding,
- truncation=True,
- # We use this argument because the texts in our dataset are lists of words (with a label for each word).
- is_split_into_words=True,
- )
-
- labels = []
- for i, label in enumerate(examples[label_column_name]):
- word_ids = tokenized_inputs.word_ids(batch_index=i)
- previous_word_idx = None
- label_ids = []
- for word_idx in word_ids:
- # Special tokens have a word id that is None. We set the label to -100 so they are automatically
- # ignored in the loss function.
- if word_idx is None:
- label_ids.append(-100)
- # We set the label for the first token of each word.
- elif word_idx != previous_word_idx:
- label_ids.append(label_to_id[label[word_idx]])
- # For the other tokens in a word, we set the label to either the current label or -100, depending on
- # the label_all_tokens flag.
- else:
- label_ids.append(
- label_to_id[label[word_idx]] if args.label_all_tokens else -100
- )
- previous_word_idx = word_idx
-
- labels.append(label_ids)
- tokenized_inputs["labels"] = labels
- return tokenized_inputs
-
- processed_raw_datasets = raw_datasets.map(
- tokenize_and_align_labels,
- batched=True,
- remove_columns=raw_datasets["train"].column_names,
- )
-
- train_dataset = processed_raw_datasets["train"]
- eval_dataset = processed_raw_datasets["validation"]
-
- # DataLoaders creation:
- if args.pad_to_max_length:
- # If padding was already done ot max length, we use the default data collator that will just convert everything
- # to tensors.
- data_collator = default_data_collator
- else:
- # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of
- # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
- # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
- data_collator = DataCollatorForTokenClassification(
- tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
- )
-
- train_dataloader = DataLoader(
- train_dataset,
- shuffle=True,
- collate_fn=data_collator,
- batch_size=args.per_device_train_batch_size,
- )
- eval_dataloader = DataLoader(
- eval_dataset,
- collate_fn=data_collator,
- batch_size=args.per_device_eval_batch_size,
- )
-
- # Optimizer
- # Split weights in two groups, one with weight decay and the other not.
- no_decay = ["bias", "LayerNorm.weight"]
- optimizer_grouped_parameters = [
- {
- "params": [
- p
- for n, p in model.named_parameters()
- if not any(nd in n for nd in no_decay)
- ],
- "weight_decay": args.weight_decay,
- },
- {
- "params": [
- p
- for n, p in model.named_parameters()
- if any(nd in n for nd in no_decay)
- ],
- "weight_decay": 0.0,
- },
- ]
- optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
-
- # Use the device given by the `accelerator` object.
- device = accelerator.device
- model.to(device)
-
- # Prepare everything with our `accelerator`.
- model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
- model, optimizer, train_dataloader, eval_dataloader
- )
-
- # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
- # shorter in multiprocess)
-
- # Scheduler and math around the number of training steps.
- num_update_steps_per_epoch = math.ceil(
- len(train_dataloader) / args.gradient_accumulation_steps
- )
- if args.max_train_steps is None:
- args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
- else:
- args.num_train_epochs = math.ceil(
- args.max_train_steps / num_update_steps_per_epoch
- )
-
- lr_scheduler = get_scheduler(
- name=args.lr_scheduler_type,
- optimizer=optimizer,
- num_warmup_steps=args.num_warmup_steps,
- num_training_steps=args.max_train_steps,
- )
-
- # Metrics
- metric = load_metric("seqeval")
-
- def get_labels(predictions, references):
- # Transform predictions and references tensos to numpy arrays
- if device.type == "cpu":
- y_pred = predictions.detach().clone().numpy()
- y_true = references.detach().clone().numpy()
- else:
- y_pred = predictions.detach().cpu().clone().numpy()
- y_true = references.detach().cpu().clone().numpy()
-
- # Remove ignored index (special tokens)
- true_predictions = [
- [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
- for pred, gold_label in zip(y_pred, y_true)
- ]
- true_labels = [
- [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
- for pred, gold_label in zip(y_pred, y_true)
- ]
- return true_predictions, true_labels
-
- def compute_metrics():
- results = metric.compute()
- if args.return_entity_level_metrics:
- # Unpack nested dictionaries
- final_results = {}
- for key, value in results.items():
- if isinstance(value, dict):
- for n, v in value.items():
- final_results[f"{key}_{n}"] = v
- else:
- final_results[key] = value
- return final_results
- else:
- return {
- "precision": results["overall_precision"],
- "recall": results["overall_recall"],
- "f1": results["overall_f1"],
- "accuracy": results["overall_accuracy"],
- }
-
- # Train!
- total_batch_size = (
- args.per_device_train_batch_size
- * accelerator.num_processes
- * args.gradient_accumulation_steps
- )
-
- logger.info("***** Running training *****")
- logger.info(f" Num examples = {len(train_dataset)}")
- logger.info(f" Num Epochs = {args.num_train_epochs}")
- logger.info(
- f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
- )
- logger.info(
- f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
- )
- logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
- logger.info(f" Total optimization steps = {args.max_train_steps}")
- # Only show the progress bar once on each machine.
- progress_bar = tqdm(
- range(args.max_train_steps), disable=not accelerator.is_local_main_process
- )
- completed_steps = 0
-
- for epoch in range(args.num_train_epochs):
- model.train()
- for step, batch in enumerate(train_dataloader):
- outputs = model(**batch)
- loss = outputs.loss
- loss = loss / args.gradient_accumulation_steps
- accelerator.backward(loss)
- if (
- step % args.gradient_accumulation_steps == 0
- or step == len(train_dataloader) - 1
- ):
- optimizer.step()
- lr_scheduler.step()
- optimizer.zero_grad()
- progress_bar.update(1)
- completed_steps += 1
-
- if completed_steps >= args.max_train_steps:
- break
-
- model.eval()
- for step, batch in enumerate(eval_dataloader):
- with torch.no_grad():
- outputs = model(**batch)
- predictions = outputs.logits.argmax(dim=-1)
- labels = batch["labels"]
- if (
- not args.pad_to_max_length
- ): # necessary to pad predictions and labels for being gathered
- predictions = accelerator.pad_across_processes(
- predictions, dim=1, pad_index=-100
- )
- labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
-
- predictions_gathered = accelerator.gather(predictions)
- labels_gathered = accelerator.gather(labels)
- preds, refs = get_labels(predictions_gathered, labels_gathered)
- metric.add_batch(
- predictions=preds,
- references=refs,
- ) # predictions and preferences are expected to be a nested list of labels, not label_ids
-
- eval_metric = metric.compute()
- # eval_metric = compute_metrics()
- accelerator.print(f"epoch {epoch}:", eval_metric)
-
- if args.output_dir is not None:
- accelerator.wait_for_everyone()
- unwrapped_model = accelerator.unwrap_model(model)
- unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
-
-
-if __name__ == "__main__":
- main()
diff --git a/examples/training/neurst/README.md b/examples/training/neurst/README.md
index bb3ded25..d755155e 100644
--- a/examples/training/neurst/README.md
+++ b/examples/training/neurst/README.md
@@ -3,27 +3,27 @@ This repo contains an example for how to use LightSeq to accerate the training o
First you should install these requirements.
```shell
-pip install subword-nmt pyyaml sacrebleu sacremoses
-git clone https://github.com/moses-smt/mosesdecoder.git
+$ pip install subword-nmt pyyaml sacrebleu sacremoses
+$ git clone https://github.com/moses-smt/mosesdecoder.git
```
Then clone NeurST and switch to lightseq branch.
```shell
-git clone https://github.com/bytedance/neurst.git
-cd neurst/
-git checkout lightseq
-pip install -e .
+$ git clone https://github.com/bytedance/neurst.git
+$ cd neurst/
+$ git checkout lightseq
+$ pip install -e .
```
Install lightseq
```shell
-pip install http://sf3-ttcdn-tos.pstatp.com/obj/nlp-opensource/lightseq/tensorflow/lightseq_tf-2.0.1-cp37-cp37m-linux_x86_64.whl
+$ pip install http://sf3-ttcdn-tos.pstatp.com/obj/nlp-opensource/lightseq/tensorflow/lightseq_tf-2.0.1-cp37-cp37m-linux_x86_64.whl
```
Download and preprocess data
```shell
-./examples/translation/prepare-wmt14en2de-bpe.sh ../mosesdecoder
+$ ./examples/translation/prepare-wmt14en2de-bpe.sh ../mosesdecoder
```
Train the model
```shell
-python3 -m neurst.cli.run_exp \
+$ python3 -m neurst.cli.run_exp \
--config_paths wmt14_en_de/training_args.yml,wmt14_en_de/translation_bpe.yml \
--hparams_set transformer_base \
--model_dir wmt14_en_de/benchmark_base \
diff --git a/examples/triton_backend/README.md b/examples/triton_backend/README.md
index c4eed7d1..2ab191da 100644
--- a/examples/triton_backend/README.md
+++ b/examples/triton_backend/README.md
@@ -21,11 +21,11 @@
- The meaning of parameters in config.pbtxt; more information can be found in [Model config of tritonbackend](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto)
- > ${name}: name of model,**which should be same with **
+ > ${name}: name of model, **which should be same with **
>
- > ${backend}: **fixed value - "lightseq"**,which is used to recognize the dynamic link library of tritonbackend, libtriton_lightseq.so
+ > ${backend}: **fixed value - "lightseq"**, which is used to recognize the dynamic link library of tritonbackend, libtriton_lightseq.so
>
- > ${default_model_filename}: name of model file,**which should be same with **
+ > ${default_model_filename}: name of model file, **which should be same with **
>
> ${parameters - value - string_value}: the type of model, which should be supported by lightseq. You can choose `Transformer`|`QuantTransformer`|`Bert`|`Gpt`|`Moe`
diff --git a/lightseq/inference/README.md b/lightseq/inference/README.md
index 24b2bfad..0819db21 100644
--- a/lightseq/inference/README.md
+++ b/lightseq/inference/README.md
@@ -65,15 +65,15 @@ More results is available [here](../../docs/inference/performance.md).
We provide an end2end bart-base example to see how fast LightSeq is compared to HuggingFace. First you should install these requirements.
```shell
-pip install torch tensorflow transformers lightseq
-cd examples/inference/python
+$ pip install torch tensorflow transformers lightseq
+$ cd examples/inference/python
```
then you can check the performance by simply running following commands. `hf_bart_export.py` is used to transform pytorch weights to LightSeq protobuffer.
```shell
-python export/hf_bart_export.py
-python test/ls_bart.py
+$ python export/huggingface/hf_bart_export.py
+$ python test/ls_bart.py
```
On our Tesla V100 we get the following output; a 10x speedup is obtained by running LightSeq rather than HuggingFace.
@@ -97,7 +97,7 @@ Nothing's gonna change my love for you.
Drop everything now. Meet me in the pouring rain. Kiss me on the sidewalk.
```
-LightSeq installation from pypi only supports python 3.6 to 3.8 on Linux for now. Consider compiling from source if you have other environments.
+LightSeq installation from PyPI only supports Python 3.6 to 3.8 on Linux for now. Consider compiling from source if you have other environments.
And there is also a quick start for huggingface GPT in examples.
@@ -108,8 +108,8 @@ We provide python api to call lightseq, all you need is to install `lightseq` wi
And check these files `lightseq/inference/proto/*.proto` to prepare your model weights. We provide an example weight file for you to test.
```shell
-curl -OL https://github.com/bytedance/lightseq/releases/download/v0.0.1/transformer_weight.tar.gz
-tar -zxvf transformer_weight.tar.gz
+$ curl -OL https://github.com/bytedance/lightseq/releases/download/v0.0.1/transformer_weight.tar.gz
+$ tar -zxvf transformer_weight.tar.gz
```
Finally you can run lightseq in only a few lines!
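The "few lines" themselves fall outside this hunk; as a rough sketch only (the class name `lsi.Transformer`, the weight filename `transformer.pb`, and the sample token ids below are assumptions, not taken from this diff), a call through the Python API looks roughly like:

```python
import lightseq.inference as lsi

# Load the protobuf weights prepared above; the second argument caps the batch size.
model = lsi.Transformer("transformer.pb", 8)
# Run inference on a batch of source token id sequences.
result = model.infer([[4, 17, 23, 65, 8, 2]])
print(result)
```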
@@ -138,12 +138,12 @@ To avoid problems caused by inconsistent environments, you can use the pre-built
[nvidia-docker](https://github.com/NVIDIA/nvidia-docker) and make your GPU driver version >= 410.48
```shell
-docker pull nvcr.io/nvidia/tensorrtserver:19.05-py3
+$ docker pull nvcr.io/nvidia/tensorrtserver:19.05-py3
#
-docker run --gpus '"device=0"' -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v
+$ docker run --gpus '"device=0"' -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v
/${current}/${path}:/quick_start nvcr.io/nvidia/tensorrtserver:19.05-py3 /bin/bash
# inside container
-cd /quick_start
+$ cd /quick_start
```
### Use our pre-built lib
@@ -154,8 +154,8 @@ version, we will upload binary executable example and dynamic link library of mo
custom backend of TRTIS.
```shell
-wget https://github.com/bytedance/lightseq/releases/download/${VERSION}/${VERSION}_libs.tar.gz
-tar -zxvf ${VERSION}_libs.tar.gz
+$ wget https://github.com/bytedance/lightseq/releases/download/${VERSION}/${VERSION}_libs.tar.gz
+$ tar -zxvf ${VERSION}_libs.tar.gz
```
### Run local inference demo
@@ -164,12 +164,12 @@ To run local inference demo, you need to prepare model weights saved in custom p
LightSeq and input token ids. We provide a GPT-LM model and its corresponding input token ids:
```shell
-wget https://github.com/bytedance/lightseq/releases/download/v0.0.1/v0.0.1_gptlm.pkg.tar.gz
-tar -zxvf v0.0.1_gptlm.pkg.tar.gz
+$ wget https://github.com/bytedance/lightseq/releases/download/v0.0.1/v0.0.1_gptlm.pkg.tar.gz
+$ tar -zxvf v0.0.1_gptlm.pkg.tar.gz
# fp32 example
-./{VERSION}_libs/gptlm_example.fp32 ./v0.0.1_gptlm.pkg/gpt.pb ./v0.0.1_gptlm.pkg/test_case
+$ ./{VERSION}_libs/gptlm_example.fp32 ./v0.0.1_gptlm.pkg/gpt.pb ./v0.0.1_gptlm.pkg/test_case
# fp16 example
-./{VERSION}_libs/gptlm_example.fp16 ./v0.0.1_gptlm.pkg/gpt.pb ./v0.0.1_gptlm.pkg/test_case
+$ ./{VERSION}_libs/gptlm_example.fp16 ./v0.0.1_gptlm.pkg/gpt.pb ./v0.0.1_gptlm.pkg/test_case
```
To run the end-to-end model server based on TRTIS, you need to prepare a custom backend [model
@@ -187,15 +187,15 @@ models/
With the pre-built libraries and example weights mentioned above, you can easily run a server:
```shell
-mkdir -p ./model_zoo/gptlm/1
-wget https://github.com/bytedance/lightseq/releases/download/v0.0.1/v0.0.1_gptlm.config.pbtxt
-mv v0.0.1_gptlm.config.pbtxt model_zoo/gptlm/config.pbtxt
-cp ./v0.0.1_gptlm.pkg/gpt.pb model_zoo/gptlm/gpt.pb
-cp ./{VERSION}_libs/libgptlm.so.fp32 model_zoo/gptlm/1/libgptlm.so
+$ mkdir -p ./model_zoo/gptlm/1
+$ wget https://github.com/bytedance/lightseq/releases/download/v0.0.1/v0.0.1_gptlm.config.pbtxt
+$ mv v0.0.1_gptlm.config.pbtxt model_zoo/gptlm/config.pbtxt
+$ cp ./v0.0.1_gptlm.pkg/gpt.pb model_zoo/gptlm/gpt.pb
+$ cp ./{VERSION}_libs/libgptlm.so.fp32 model_zoo/gptlm/1/libgptlm.so
# or fp16 server
# cp ./{VERSION}_libs/libgptlm.so.fp16 model_zoo/gptlm/1/libgptlm.so
-export MODEL_ZOO="/quick_start/model_zoo"
-trtserver --model-store=${MODEL_ZOO}
+$ export MODEL_ZOO="/quick_start/model_zoo"
+$ trtserver --model-store=${MODEL_ZOO}
```
After starting the server, invoking the [TRTIS
diff --git a/lightseq/inference/kernels/CMakeLists.txt b/lightseq/inference/kernels/CMakeLists.txt
index 5f647bcd..b9cebb32 100644
--- a/lightseq/inference/kernels/CMakeLists.txt
+++ b/lightseq/inference/kernels/CMakeLists.txt
@@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.18)
set(cuda_kernel_files
gptKernels.cc.cu
+ gptKernels_int8.cc.cu
transformerKernels.cc.cu
multilgKernels.cc.cu
embKernels.cc.cu
diff --git a/lightseq/inference/kernels/embKernels_int8.cc.cu b/lightseq/inference/kernels/embKernels_int8.cc.cu
index bade6241..28251303 100644
--- a/lightseq/inference/kernels/embKernels_int8.cc.cu
+++ b/lightseq/inference/kernels/embKernels_int8.cc.cu
@@ -14,7 +14,8 @@ template
__global__ void ker_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
const int *tokens, T *output, int *pad_mask,
int pad_id, int batch_size, int seq_len,
- int hidden_dim, float dequant_scale) {
+ int hidden_dim, float dequant_scale,
+ bool scaled) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= batch_size * seq_len * hidden_dim) {
return;
@@ -39,7 +40,8 @@ __global__ void ker_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
}
char4 value_i4 = ((char4 *)token_emb)[token * hidden_dim + dim_idx];
float4 pemb = ((float4 *)pos_emb)[seq_idx * hidden_dim + dim_idx];
- float scale = dequant_scale * sqrtf(hidden_dim << 2);
+ float scale = dequant_scale;
+ if (scaled) scale *= sqrtf(hidden_dim << 2);
value.x = float(value_i4.x) * scale + pemb.x;
value.y = float(value_i4.y) * scale + pemb.y;
value.z = float(value_i4.z) * scale + pemb.z;
@@ -49,12 +51,10 @@ __global__ void ker_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
}
template <>
-__global__ void ker_enc_emb_i8I<__half>(const int8_t *token_emb,
- const __half *pos_emb,
- const int *tokens, __half *output,
- int *pad_mask, int pad_id,
- int batch_size, int seq_len,
- int hidden_dim, float dequant_scale) {
+__global__ void ker_enc_emb_i8I<__half>(
+ const int8_t *token_emb, const __half *pos_emb, const int *tokens,
+ __half *output, int *pad_mask, int pad_id, int batch_size, int seq_len,
+ int hidden_dim, float dequant_scale, bool scaled) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= batch_size * seq_len * hidden_dim) {
return;
@@ -82,7 +82,8 @@ __global__ void ker_enc_emb_i8I<__half>(const int8_t *token_emb,
__half2 *value_h2 = (__half2 *)(&value);
char2 *value_i2 = (char2 *)(&value_i8);
__half2 *pemb_h2 = (__half2 *)(&pemb);
- float scale = dequant_scale * sqrtf(hidden_dim << 3);
+ float scale = dequant_scale;
+ if (scaled) scale *= sqrtf(hidden_dim << 3);
#pragma unroll
for (int i = 0; i < 4; i++) {
float2 value_f2;
@@ -101,7 +102,7 @@ void launch_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
int batch_size, int seq_len, int hidden_dim,
cudaStream_t stream, const T *lang_emb,
const int *lang_id, int multilg_type,
- float dequant_scale) {
+ float dequant_scale, bool scaled) {
if (hidden_dim % 4 != 0) {
throw std::runtime_error("violate hidden_dim % 4 = 0");
}
@@ -111,7 +112,7 @@ void launch_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
if (multilg_type == 0) {
ker_enc_emb_i8I<<>>(
token_emb, pos_emb, tokens, output, pad_mask, pad_id, batch_size,
- seq_len, hidden_dim, dequant_scale);
+ seq_len, hidden_dim, dequant_scale, scaled);
} else {
throw std::runtime_error("multilingle not supported");
}
@@ -124,7 +125,7 @@ void launch_enc_emb_i8I<__half>(const int8_t *token_emb, const __half *pos_emb,
int seq_len, int hidden_dim,
cudaStream_t stream, const __half *lang_emb,
const int *lang_id, int multilg_type,
- float dequant_scale) {
+ float dequant_scale, bool scaled) {
if (hidden_dim % 8 != 0) {
throw std::runtime_error("violate hidden_dim % 8 = 0");
}
@@ -135,7 +136,7 @@ void launch_enc_emb_i8I<__half>(const int8_t *token_emb, const __half *pos_emb,
if (multilg_type == 0) {
ker_enc_emb_i8I<__half><<>>(
token_emb, pos_emb, tokens, output, pad_mask, pad_id, batch_size,
- seq_len, hidden_dim, dequant_scale);
+ seq_len, hidden_dim, dequant_scale, scaled);
} else {
throw std::runtime_error("multilingle not supported");
}
@@ -145,13 +146,13 @@ template void launch_enc_emb_i8I(
const int8_t *token_emb, const float *pos_emb, const int *tokens,
float *output, int *pad_mask, int pad_id, int batch_size, int seq_len,
int hidden_dim, cudaStream_t stream, const float *lang_emb,
- const int *lang_id, int multilg_type, float dequant_scale);
+ const int *lang_id, int multilg_type, float dequant_scale, bool scaled);
template void launch_enc_emb_i8I<__half>(
const int8_t *token_emb, const __half *pos_emb, const int *tokens,
__half *output, int *pad_mask, int pad_id, int batch_size, int seq_len,
int hidden_dim, cudaStream_t stream, const __half *lang_emb,
- const int *lang_id, int multilg_type, float dequant_scale);
+ const int *lang_id, int multilg_type, float dequant_scale, bool scaled);
template <typename T>
__global__ void ker_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb,
@@ -159,7 +160,7 @@ __global__ void ker_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb,
const int *lang_id, T *output, int batch_size,
int beam_size, int hidden_dim, int vocab_size,
int step, int max_step, int multilg_type,
- float dequant_scale) {
+ float dequant_scale, bool scaled) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= batch_size * beam_size * hidden_dim) {
return;
@@ -170,8 +171,10 @@ __global__ void ker_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb,
int8_t emb;
int token = tokens[flat_3dim(batch_idx, beam_idx, step, beam_size, max_step)];
emb = token_emb[flat_2dim(dim_idx, token, vocab_size)];
- float value = float(emb) * dequant_scale * sqrtf(hidden_dim) +
- float(pos_emb[flat_2dim(step, dim_idx, hidden_dim)]);
+ float scale = dequant_scale;
+ if (scaled) scale *= sqrtf(hidden_dim);
+ float value =
+ float(emb) * scale + float(pos_emb[flat_2dim(step, dim_idx, hidden_dim)]);
output[idx] = T(value);
}
@@ -181,7 +184,7 @@ void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens,
int batch_size, int beam_size, int hidden_dim,
int vocab_size, int step, int max_step,
int multilg_type, cudaStream_t stream,
- float dequant_scale) {
+ float dequant_scale, bool scaled) {
if (step >= max_step) {
throw std::runtime_error("violate step < max_step");
}
@@ -193,19 +196,19 @@ void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens,
ker_dec_emb_i8I<<>>(
token_emb, pos_emb, tokens, lang_emb, lang_id, output, batch_size,
beam_size, hidden_dim, vocab_size, step, max_step, multilg_type,
- dequant_scale);
+ dequant_scale, scaled);
}
template void launch_dec_emb_i8I<float>(
const int8_t *token_emb, const float *pos_emb, int *tokens,
const float *lang_emb, const int *lang_id, float *output, int batch_size,
int beam_size, int hidden_dim, int vocab_size, int step, int max_step,
- int multilg_type, cudaStream_t stream, float dequant_scale);
+ int multilg_type, cudaStream_t stream, float dequant_scale, bool scaled);
template void launch_dec_emb_i8I<__half>(
const int8_t *token_emb, const __half *pos_emb, int *tokens,
const __half *lang_emb, const int *lang_id, __half *output, int batch_size,
int beam_size, int hidden_dim, int vocab_size, int step, int max_step,
- int multilg_type, cudaStream_t stream, float dequant_scale);
+ int multilg_type, cudaStream_t stream, float dequant_scale, bool scaled);
} // namespace cuda
} // namespace lightseq
diff --git a/lightseq/inference/kernels/embKernels_int8.h b/lightseq/inference/kernels/embKernels_int8.h
index 6ec8fde1..a914f9f1 100644
--- a/lightseq/inference/kernels/embKernels_int8.h
+++ b/lightseq/inference/kernels/embKernels_int8.h
@@ -11,7 +11,7 @@ void launch_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
int batch_size, int seq_len, int hidden_dim,
cudaStream_t stream, const T *lang_emb,
const int *lang_id, int multilg_type,
- float dequant_scale);
+ float dequant_scale, bool scaled = true);
template <typename T>
void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens,
@@ -19,7 +19,7 @@ void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens,
int batch_size, int beam_size, int hidden_dim,
int vocab_size, int step, int max_step,
int multilg_type, cudaStream_t stream,
- float dequant_scale);
+ float dequant_scale, bool scaled = true);
} // namespace cuda
} // namespace lightseq
diff --git a/lightseq/inference/kernels/gptKernels_int8.cc.cu b/lightseq/inference/kernels/gptKernels_int8.cc.cu
new file mode 100644
index 00000000..286193f2
--- /dev/null
+++ b/lightseq/inference/kernels/gptKernels_int8.cc.cu
@@ -0,0 +1,866 @@
+#include
+
+#include "common.h"
+#include "gptKernels_int8.h"
+#include "transformerKernels.h"
+/**
+@file
+Implements the CUDA kernel functions and their launchers
+required by the GPT model.
+Currently, fp16 and fp32 versions are provided.
+*/
+namespace lightseq {
+namespace cuda {
+__forceinline__ __device__ int8_t float2int8(float x, float quant_scale) {
+ float i8_f = x * quant_scale;
+ int32_t i8 = floorf(i8_f + 0.5);
+ i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8);
+ return int8_t(i8);
+}
+
+template <typename T>
+__global__ void ker_gpt_embedding_int8(const int8_t* token_emb,
+ const T* pos_emb, const int* token_id,
+ T* output, int* real_seq_len,
+ int padding_id, int pos_offset,
+ float dequant_scale) {
+ int target_pos = blockIdx.x * gridDim.y + blockIdx.y;
+ int tid = token_id[target_pos];
+ if (tid == padding_id) {
+ // for padding id
+ output[target_pos * blockDim.x + threadIdx.x] = 0.f;
+ return;
+ }
+ if (threadIdx.x == 0) {
+ atomicAdd(real_seq_len + blockIdx.x, 1);
+ }
+ output[target_pos * blockDim.x + threadIdx.x] =
+ T(token_emb[tid * blockDim.x + threadIdx.x]) * dequant_scale +
+ pos_emb[(blockIdx.y + pos_offset) * blockDim.x + threadIdx.x];
+}
+
+/* fp16 version */
+template <>
+__global__ void ker_gpt_embedding_int8<__half>(
+ const int8_t* token_emb, const __half* pos_emb, const int* token_id,
+ __half* output, int* real_seq_len, int padding_id, int pos_offset,
+ float dequant_scale) {
+ int target_pos = blockIdx.x * gridDim.y + blockIdx.y;
+ int tid = token_id[target_pos];
+ half2* output_h = (half2*)output;
+
+ if (tid == padding_id) {
+ // for padding id
+ output_h[target_pos * blockDim.x + threadIdx.x] = __float2half2_rn(0.f);
+ return;
+ }
+ if (threadIdx.x == 0) {
+ atomicAdd(real_seq_len + blockIdx.x, 1);
+ }
+
+ float2 te;
+ char2 cte = ((const char2*)token_emb)[tid * blockDim.x + threadIdx.x];
+ float2 pe = __half22float2(
+ ((const half2*)
+ pos_emb)[(blockIdx.y + pos_offset) * blockDim.x + threadIdx.x]);
+ te.x = float(cte.x) * dequant_scale + pe.x;
+ te.y = float(cte.y) * dequant_scale + pe.y;
+ output_h[target_pos * blockDim.x + threadIdx.x] = __float22half2_rn(te);
+}
+
+template <typename T>
+void ker_gpt_embedding_i8I_launcher(int batch_size, int batch_seq_len,
+ int hidden_size, cudaStream_t stream,
+ const int8_t* token_emb, const T* pos_emb,
+ const int* token_id, T* output,
+ int* real_seq_len, int padding_id,
+ int pos_offset, float dequant_scale) {
+  ker_gpt_embedding_int8<T>
+      <<<dim3(batch_size, batch_seq_len), hidden_size, 0, stream>>>(
+ token_emb, pos_emb, token_id, output, real_seq_len, padding_id,
+ pos_offset, dequant_scale);
+}
+
+template <>
+void ker_gpt_embedding_i8I_launcher<__half>(
+ int batch_size, int batch_seq_len, int hidden_size, cudaStream_t stream,
+ const int8_t* token_emb, const __half* pos_emb, const int* token_id,
+ __half* output, int* real_seq_len, int padding_id, int pos_offset,
+ float dequant_scale) {
+ ker_gpt_embedding_int8<__half>
+      <<<dim3(batch_size, batch_seq_len), hidden_size / 2, 0, stream>>>(
+ token_emb, pos_emb, token_id, output, real_seq_len, padding_id,
+ pos_offset, dequant_scale);
+}
+
+template void ker_gpt_embedding_i8I_launcher<float>(
+ int batch_size, int batch_seq_len, int hidden_size, cudaStream_t stream,
+ const int8_t* token_emb, const float* pos_emb, const int* token_id,
+ float* output, int* real_seq_len, int padding_id, int pos_offset,
+ float dequant_scale);
+
+template void ker_gpt_embedding_i8I_launcher<__half>(
+ int batch_size, int batch_seq_len, int hidden_size, cudaStream_t stream,
+ const int8_t* token_emb, const __half* pos_emb, const int* token_id,
+ __half* output, int* real_seq_len, int padding_id, int pos_offset,
+ float dequant_scale);
+
+__global__ void ker_ppl_i8I(const int8_t* logits, const int* input_ids,
+ const int* real_seq_len, float* ppl, int vocab_size,
+ float dequant_scale, bool in_col32) {
+ int seq_len = real_seq_len[blockIdx.x]; // remove "eos"
+ if (blockIdx.y >= seq_len - 1) {
+ // will not contribute to ppl
+ return;
+ }
+
+ int token_idx_in_batch = blockIdx.x * gridDim.y + blockIdx.y;
+ int left_logit_idx = token_idx_in_batch * vocab_size + threadIdx.x;
+ int right_logit_idx = (token_idx_in_batch + 1) * vocab_size;
+ /*
+ step 1. find max logit over the whole vocab
+ */
+ float max_logit = CUDA_FLOAT_INF_NEG;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = token_idx_in_batch;
+ int col_id = idx - token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id, gridDim.x * gridDim.y,
+ vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ max_logit = fmaxf(max_logit, (float)logits[logits_idx] * dequant_scale);
+ }
+ max_logit = blockReduceMax(max_logit);
+ __shared__ float s_max_logit;
+ if (threadIdx.x == 0) {
+ s_max_logit = max_logit;
+ }
+ __syncthreads();
+
+ /*
+ step 2. compute the log probability for the given token,
+ add it to the sequence's ppl
+ */
+ float sum_exp_logit = 0.f;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = token_idx_in_batch;
+ int col_id = idx - token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id, gridDim.x * gridDim.y,
+ vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float lgt = fmaxf((float)logits[logits_idx] * dequant_scale - s_max_logit,
+ logit_thresh_min);
+ sum_exp_logit += expf(lgt);
+ }
+ sum_exp_logit = blockReduceSum(sum_exp_logit);
+
+ if (threadIdx.x == 0) {
+ int token_id = input_ids[token_idx_in_batch + 1];
+ int logits_idx;
+ if (in_col32) {
+ int row_id = token_idx_in_batch;
+ int col_id = token_id;
+ logits_idx = row_major2flat_col32(row_id, col_id, gridDim.x * gridDim.y,
+ vocab_size);
+ } else {
+ logits_idx = token_idx_in_batch * vocab_size + token_id;
+ }
+ float log_prob = ((float)logits[logits_idx] * dequant_scale - s_max_logit -
+ logf(sum_exp_logit)) /
+ (float)(seq_len - 1);
+ atomicAdd(ppl + blockIdx.x, -log_prob);
+ }
+}
+
+void ker_ppl_i8I_launcher(int batch_size, int batch_seq_len,
+ int max_thread_per_block, cudaStream_t stream,
+ const int8_t* logits, const int* input_ids,
+ const int* real_seq_len, float* ppl, int vocab_size,
+ float dequant_scale, bool in_col32) {
+  ker_ppl_i8I<<<dim3(batch_size, batch_seq_len), max_thread_per_block, 0,
+                stream>>>(logits, input_ids, real_seq_len, ppl, vocab_size,
+                          dequant_scale, in_col32);
+}
+
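+/**
+@brief: ker_correlation_softmax_gpt_i32I
+causal-masked softmax over int32 attention scores; scores are dequantized
+with dequant_scale^2 (query and key are both int8) and scaled by attn_scale
+*/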
+template <typename T>
+__global__ void ker_correlation_softmax_gpt_i32I(
+ int32_t* correlation, T* output, const int* real_seq_len,
+ const int batch_seq_len, float attn_scale, float dequant_scale) {
+ int query_token_pos = blockIdx.y % batch_seq_len;
+ if (query_token_pos >= real_seq_len[blockIdx.x]) {
+ return;
+ }
+
+ int mask = 0; // can see the token when mask=0
+ if (threadIdx.x > query_token_pos || threadIdx.x >= batch_seq_len) {
+ mask = 1; // Can only see the token on the left side of it
+ }
+
+ int idx = (blockIdx.x * gridDim.y + blockIdx.y) * batch_seq_len + threadIdx.x;
+ float val = threadIdx.x < batch_seq_len
+ ? ((float)correlation[idx] * attn_scale * dequant_scale *
+ dequant_scale)
+ : CUDA_FLOAT_INF_NEG;
+ float max_val = blockReduceMax(mask ? CUDA_FLOAT_INF_NEG : val);
+ __shared__ float smax;
+ if (threadIdx.x == 0) smax = max_val;
+ __syncthreads();
+
+ val = mask ? 0.f : expf(val - smax);
+ float rsum = blockReduceSum(val);
+ __shared__ float ssum;
+ if (threadIdx.x == 0) ssum = rsum;
+ __syncthreads();
+
+ if (threadIdx.x < batch_seq_len) output[idx] = (T)(val / ssum);
+}
+
+template <typename T>
+void ker_correlation_softmax_gpt_i32I_launcher(
+ int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
+ int32_t* correlation, T* output, const int* real_seq_len, float attn_scale,
+ float dequant_scale) {
+ int block_dim = batch_seq_len;
+ if (batch_seq_len < 1024) {
+ block_dim = (batch_seq_len + 31) >> 5;
+ block_dim *= 32;
+ }
+
+  ker_correlation_softmax_gpt_i32I<T>
+      <<<dim3(batch_size, head_num * batch_seq_len), block_dim, 0, stream>>>(
+ correlation, output, real_seq_len, batch_seq_len, attn_scale,
+ dequant_scale);
+}
+
+template void ker_correlation_softmax_gpt_i32I_launcher<float>(
+ int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
+ int32_t* correlation, float* output, const int* real_seq_len,
+ float attn_scale, float dequant_scale);
+
+template void ker_correlation_softmax_gpt_i32I_launcher<__half>(
+ int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
+ int32_t* correlation, __half* output, const int* real_seq_len,
+ float attn_scale, float dequant_scale);
+
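+/**
+@brief: ker_topk_sample_i8I
+top-k sampling over the int8 logits of the last step, one block per batch
+sequence; the sampled id is appended to new_input_ids and also written back
+to old_input_ids as the input of the next decoding step
+*/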
+template <int k>
+__global__ void ker_topk_sample_i8I(const int8_t* logits, int* old_input_ids,
+ int* new_input_ids, const int* real_seq_len,
+ const int vocab_size,
+ const int batch_seq_len, int logits_seq_len,
+ int* unfinished, curandState* curandstate,
+ int eos_id, float dequant_scale,
+ bool in_col32) {
+ int last_token_idx_in_batch = blockIdx.x * batch_seq_len + batch_seq_len - 1;
+
+ /* add EOS to end if last token is EOS */
+ if (old_input_ids[last_token_idx_in_batch] == eos_id) {
+ int left_token_idx = blockIdx.x * batch_seq_len + threadIdx.x;
+ int right_token_idx = (blockIdx.x + 1) * batch_seq_len;
+ for (int idx = left_token_idx; idx < right_token_idx; idx += blockDim.x) {
+ int new_idx = idx + blockIdx.x;
+ new_input_ids[new_idx] = old_input_ids[idx];
+ }
+ if (threadIdx.x == 0) {
+ // blockIdx.x * (batch_seq_len+1) + batch_seq_len
+ new_input_ids[(blockIdx.x + 1) * (batch_seq_len + 1) - 1] = eos_id;
+ old_input_ids[gridDim.x * batch_seq_len + blockIdx.x] = eos_id;
+ }
+ return;
+ }
+ int logits_token_idx_in_batch =
+ blockIdx.x * logits_seq_len + logits_seq_len - 1;
+ int left_logit_idx = logits_token_idx_in_batch * vocab_size + threadIdx.x;
+ int right_logit_idx = (logits_token_idx_in_batch + 1) * vocab_size;
+
+ /*
+ step1. find max logit and rough Kth logit over the whole vocab
+ */
+ __shared__ float s_max_logit, s_topk_logit;
+ float rough_top_kth_logit = CUDA_FLOAT_INF_NEG;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ rough_top_kth_logit =
+ fmaxf(rough_top_kth_logit, (float)logits[logits_idx] * dequant_scale);
+ }
+ float max_logit = blockReduceMax(rough_top_kth_logit);
+  rough_top_kth_logit = blockRoughTopK<float, k>(rough_top_kth_logit);
+ if (threadIdx.x == 0) {
+ s_topk_logit = rough_top_kth_logit;
+ s_max_logit = max_logit;
+ }
+ __syncthreads();
+
+ __shared__ int s_tid;
+
+ if (k != 1) {
+ /* step2 hold one logit per thread which larger than Kth logit and sample
+ * from them */
+ float topk_exp_sum, topk_exp = CUDA_FLOAT_INF_NEG;
+ int topk_tid = vocab_size;
+ int test_num = 0;
+ __shared__ float s_topk_exp_sum;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(
+ row_id, col_id, gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit = (float)logits[logits_idx] * dequant_scale;
+ float logit_exp = expf(fmaxf(logit - s_max_logit, logit_thresh_min));
+ if (logit >= s_topk_logit) test_num++;
+ if (logit >= s_topk_logit && logit_exp > topk_exp) {
+ topk_exp = logit_exp;
+ topk_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+
+ test_num = blockReduceSum(test_num);
+
+ if (topk_tid == vocab_size) topk_exp = 0;
+ topk_exp_sum = blockReduceSum(topk_exp);
+ if (threadIdx.x == 0) {
+ s_topk_exp_sum = topk_exp_sum;
+ }
+ __syncthreads();
+
+ /* calculate cumulative probability */
+ float topk_prob = topk_exp / s_topk_exp_sum;
+ float prefix_sum_prob;
+    typedef cub::BlockScan<float, 1024> BlockScan;
+ __shared__ typename BlockScan::TempStorage temp_storage;
+ BlockScan(temp_storage).InclusiveSum(topk_prob, prefix_sum_prob);
+
+ __shared__ float random_x;
+ if (threadIdx.x == 0) {
+ random_x = curand_uniform(curandstate + blockIdx.x);
+ }
+ __syncthreads();
+
+ if (threadIdx.x == 0) {
+ s_tid = vocab_size;
+ }
+ __syncthreads();
+
+ int threadID = threadIdx.x;
+ __shared__ int s_threadID;
+ __shared__ float s_max_prob;
+ if (random_x > prefix_sum_prob) threadID = blockDim.x;
+ threadID = blockReduceMin(threadID);
+ float max_prob = blockReduceMax(topk_prob);
+ if (threadIdx.x == 0) {
+ s_threadID = threadID;
+ s_max_prob = max_prob;
+ }
+ __syncthreads();
+ if (threadIdx.x == s_threadID) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+ if (s_tid == vocab_size && topk_prob == s_max_prob) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+ } else {
+ s_tid = vocab_size;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(
+ row_id, col_id, gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit = (float)logits[logits_idx] * dequant_scale;
+ if (logit == s_max_logit) {
+ s_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+ __syncthreads();
+ }
+
+  /* if the newly sampled tid is not EOS, set unfinished to TRUE */
+ if (threadIdx.x == 0) {
+ if (s_tid != eos_id) unfinished[0] = 1;
+ }
+
+ /* step3 copy old_input_ids to new_input_ids and add new sampled ids */
+ int left_token_idx = blockIdx.x * batch_seq_len + threadIdx.x;
+ int right_token_idx = (blockIdx.x + 1) * batch_seq_len;
+ for (int idx = left_token_idx; idx < right_token_idx; idx += blockDim.x) {
+ int new_idx = idx + blockIdx.x;
+ new_input_ids[new_idx] = old_input_ids[idx];
+ }
+ if (threadIdx.x == 0) {
+ new_input_ids[(blockIdx.x + 1) * (batch_seq_len + 1) - 1] = s_tid;
+ // save the newly sampled ids to old_input_ids for next step inputs
+ old_input_ids[gridDim.x * batch_seq_len + blockIdx.x] = s_tid;
+ }
+}
+
+void ker_topk_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ int logits_seq_len, int max_thread_per_block,
+ cudaStream_t stream, const int8_t* logits,
+ int* old_input_ids, int* new_input_ids,
+ const int* real_seq_len, const int vocab_size,
+ const int k, int* unfinished,
+ curandState* curandstate, int eos_id,
+ float dequant_scale, bool in_col32) {
+ if (k == 1)
+    ker_topk_sample_i8I<1><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 2)
+    ker_topk_sample_i8I<2><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 4)
+    ker_topk_sample_i8I<4><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 8)
+    ker_topk_sample_i8I<8><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 16)
+    ker_topk_sample_i8I<16><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 32)
+    ker_topk_sample_i8I<32><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else {
+ throw std::invalid_argument("topk argument should be in [1,2,4,8,16,32]");
+ }
+}
+
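+/**
+@brief: ker_topp_sample_i8I
+approximate nucleus (top-p) sampling over the int8 logits of the last step:
+per-thread max logits are sorted descending, a cumulative-probability cutoff
+derived from p selects a logit threshold, and one id is sampled from the
+logits above that threshold
+*/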
+__global__ void ker_topp_sample_i8I(const int8_t* logits, int* old_input_ids,
+ int* new_input_ids, const int* real_seq_len,
+ const int vocab_size,
+ const int batch_seq_len, int logits_seq_len,
+ int* unfinished, float p,
+ curandState* curandstate, int eos_id,
+ float dequant_scale, bool in_col32) {
+ int token_idx_in_batch = blockIdx.x * batch_seq_len + batch_seq_len - 1;
+
+ /* add EOS to end if last token is EOS */
+ if (old_input_ids[token_idx_in_batch] == eos_id) {
+ int left_token_idx = blockIdx.x * batch_seq_len + threadIdx.x;
+ int right_token_idx = (blockIdx.x + 1) * batch_seq_len;
+ for (int idx = left_token_idx; idx < right_token_idx; idx += blockDim.x) {
+ int new_idx = idx + blockIdx.x;
+ new_input_ids[new_idx] = old_input_ids[idx];
+ }
+ if (threadIdx.x == 0) {
+ new_input_ids[(blockIdx.x + 1) * (batch_seq_len + 1) - 1] = eos_id;
+ old_input_ids[gridDim.x * batch_seq_len + blockIdx.x] = eos_id;
+ }
+ return;
+ }
+ int logits_token_idx_in_batch =
+ blockIdx.x * logits_seq_len + logits_seq_len - 1;
+ int left_logit_idx = logits_token_idx_in_batch * vocab_size + threadIdx.x;
+ int right_logit_idx = (logits_token_idx_in_batch + 1) * vocab_size;
+
+ /*
+ step1. find max logit in each thread and sample from these probs with nucleus
+ sampling
+ */
+ __shared__ float s_max_logit;
+ float max_logit = CUDA_FLOAT_INF_NEG;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ max_logit = fmaxf(max_logit, (float)logits[logits_idx] * dequant_scale);
+ }
+ float max_logit_array[1];
+ max_logit_array[0] = max_logit;
+  typedef cub::BlockRadixSort<float, 1024, 1> BlockRadixSort;
+ __shared__ typename BlockRadixSort::TempStorage sort_temp_storage;
+ BlockRadixSort(sort_temp_storage).SortDescending(max_logit_array);
+ float presum_max_logit_exp;
+ max_logit = max_logit_array[0];
+
+ float block_max_logit = blockReduceMax(max_logit);
+ if (threadIdx.x == 0) {
+ s_max_logit = block_max_logit;
+ }
+ __syncthreads();
+
+ float biased_logit_exp =
+ expf(fmaxf(max_logit - s_max_logit, logit_thresh_min));
+
+  typedef cub::BlockScan<float, 1024> BlockScan;
+ __shared__ typename BlockScan::TempStorage presum_temp_storage;
+ BlockScan(presum_temp_storage)
+ .InclusiveSum(biased_logit_exp, presum_max_logit_exp);
+
+ float topp_exp_threshold;
+ if (threadIdx.x == blockDim.x - 1) {
+ topp_exp_threshold = p * presum_max_logit_exp;
+ }
+ __shared__ float s_presum_logit_exp_threshold;
+ if (presum_max_logit_exp > topp_exp_threshold) {
+ presum_max_logit_exp = CUDA_FLOAT_INF_NEG;
+ }
+ float logit_exp_threshold = blockReduceMax(presum_max_logit_exp);
+ if (threadIdx.x == 0) {
+ s_presum_logit_exp_threshold = logit_exp_threshold;
+ }
+ __syncthreads();
+
+ __shared__ float s_logit_threshold;
+ if (presum_max_logit_exp == s_presum_logit_exp_threshold) {
+ s_logit_threshold = max_logit;
+ }
+ __syncthreads();
+
+ /* step2 hold one logit per thread and sample
+ * from them */
+ float topk_exp_sum, topk_exp = CUDA_FLOAT_INF_NEG;
+ int topk_tid = vocab_size;
+ int test_num = 0;
+ __shared__ float s_topk_exp_sum;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit = (float)logits[logits_idx] * dequant_scale;
+ float logit_exp = expf(fmaxf(logit - s_max_logit, logit_thresh_min));
+ if (logit >= s_logit_threshold) test_num++;
+ if (logit >= s_logit_threshold && logit_exp > topk_exp) {
+ topk_exp = logit_exp;
+ topk_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+
+ test_num = blockReduceSum(test_num);
+
+ if (topk_tid == vocab_size) topk_exp = 0;
+ topk_exp_sum = blockReduceSum(topk_exp);
+ if (threadIdx.x == 0) {
+ s_topk_exp_sum = topk_exp_sum;
+ }
+ __syncthreads();
+
+ /* calculate cumulative probability */
+ float topk_prob = topk_exp / s_topk_exp_sum;
+ float prefix_sum_prob;
+ BlockScan(presum_temp_storage).InclusiveSum(topk_prob, prefix_sum_prob);
+
+ __shared__ float random_x;
+ if (threadIdx.x == 0) {
+ random_x = curand_uniform(curandstate + blockIdx.x);
+ }
+ __syncthreads();
+
+ __shared__ int s_tid;
+ if (threadIdx.x == 0) {
+ s_tid = vocab_size;
+ }
+ __syncthreads();
+
+ int threadID = threadIdx.x;
+ __shared__ int s_threadID;
+ __shared__ float s_max_prob;
+ if (random_x > prefix_sum_prob) threadID = blockDim.x;
+ threadID = blockReduceMin(threadID);
+ float max_prob = blockReduceMax(topk_prob);
+ if (threadIdx.x == 0) {
+ s_threadID = threadID;
+ s_max_prob = max_prob;
+ }
+ __syncthreads();
+ if (threadIdx.x == s_threadID) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+ if (s_tid == vocab_size && topk_prob == s_max_prob) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+  /* if the newly sampled tid is not EOS, set unfinished to TRUE */
+ if (threadIdx.x == 0) {
+ if (s_tid != eos_id) unfinished[0] = 1;
+ }
+
+ /* step3 copy old_input_ids to new_input_ids and add new sampled ids */
+ int left_token_idx = blockIdx.x * batch_seq_len + threadIdx.x;
+ int right_token_idx = (blockIdx.x + 1) * batch_seq_len;
+ for (int idx = left_token_idx; idx < right_token_idx; idx += blockDim.x) {
+ int new_idx = idx + blockIdx.x;
+ new_input_ids[new_idx] = old_input_ids[idx];
+ }
+ if (threadIdx.x == 0) {
+ new_input_ids[(blockIdx.x + 1) * (batch_seq_len + 1) - 1] = s_tid;
+ // save the newly sampled ids to old_input_ids for next step inputs
+ old_input_ids[gridDim.x * batch_seq_len + blockIdx.x] = s_tid;
+ }
+}
+
+void ker_topp_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ int logits_seq_len, int max_thread_per_block,
+ cudaStream_t stream, const int8_t* logits,
+ int* old_input_ids, int* new_input_ids,
+ const int* real_seq_len, const int vocab_size,
+ const float p, int* unfinished,
+ curandState* curandstate, int eos_id,
+ float dequant_scale, bool in_col32) {
+  ker_topp_sample_i8I<<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, p, curandstate, eos_id,
+ dequant_scale, in_col32);
+}
+
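+/**
+@brief: ker_arrange_qkv_with_cache_i8I_i8O
+split the int8 QKV projection of the newest token into query/key/value with
+[batch, head, seq, dim] layout, adding the bias and requantizing; earlier
+positions of key/value are copied from the int8 KV cache
+*/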
+template <typename T>
+__global__ void ker_arrange_qkv_with_cache_i8I_i8O(
+ const int8_t* ori_qkv, const T* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32) {
+ int hidden_size = head_num * dim_per_head;
+ int batch_size = gridDim.x / batch_seq_len;
+ int batch_id = blockIdx.x / batch_seq_len;
+ int token_id = blockIdx.x % batch_seq_len;
+ int head_id = threadIdx.x / dim_per_head;
+ int dim_id = threadIdx.x % dim_per_head;
+ int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len, dim_per_head);
+ int8_t new_val;
+
+ if (token_id < batch_seq_len - 1) {
+ int old_target_id =
+ targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len - 1, dim_per_head);
+ if (blockIdx.y == 0) return;
+ if (blockIdx.y == 1) new_val = k_cache[old_target_id];
+ if (blockIdx.y == 2) new_val = v_cache[old_target_id];
+ } else {
+ int qkv_index;
+ if (in_col32) {
+ int row_id = batch_id;
+ int col_id = blockIdx.y * hidden_size + threadIdx.x;
+ qkv_index = row_major2flat_col32(row_id, col_id, batch_size,
+ gridDim.y * hidden_size);
+ } else {
+ qkv_index =
+ (batch_id * gridDim.y + blockIdx.y) * hidden_size + threadIdx.x;
+ }
+ float tmp_val = float(ori_qkv[qkv_index]) * dequant_scale +
+ __ldg(&qkv_bias[blockIdx.y * hidden_size + threadIdx.x]);
+ new_val = float2int8(tmp_val, quant_scale);
+ if (blockIdx.y == 0) {
+ target_id = targetid_4dim(batch_id, head_id, 0, dim_id, head_num, 1,
+ dim_per_head);
+ }
+ }
+
+ if (blockIdx.y == 0) new_q[target_id] = new_val;
+ if (blockIdx.y == 1) new_k[target_id] = new_val;
+ if (blockIdx.y == 2) {
+ new_v[target_id] = new_val;
+ }
+}
+
+template <>
+__global__ void ker_arrange_qkv_with_cache_i8I_i8O<__half>(
+ const int8_t* ori_qkv, const __half* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32) {
+ int hidden_size = head_num * dim_per_head;
+ int batch_size = gridDim.x / batch_seq_len;
+ int batch_id = blockIdx.x / batch_seq_len;
+ int token_id = blockIdx.x % batch_seq_len;
+ int head_id = threadIdx.x / dim_per_head;
+ int dim_id = threadIdx.x % dim_per_head;
+ int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len, dim_per_head);
+ int2 new_val;
+ int8_t* p_new_val = (int8_t*)(&new_val);
+ const int2* p_ori_qkv = (const int2*)ori_qkv;
+ const float4* p_bias = (const float4*)qkv_bias;
+ const int2* p_k_cache = (const int2*)k_cache;
+ const int2* p_v_cache = (const int2*)v_cache;
+ int2* p_new_q = (int2*)new_q;
+ int2* p_new_k = (int2*)new_k;
+ int2* p_new_v = (int2*)new_v;
+
+ if (token_id < batch_seq_len - 1) {
+ int old_target_id =
+ targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len - 1, dim_per_head);
+ if (blockIdx.y == 0) return;
+ if (blockIdx.y == 1) new_val = p_k_cache[old_target_id];
+ if (blockIdx.y == 2) new_val = p_v_cache[old_target_id];
+ } else {
+ int qkv_index;
+ if (in_col32) {
+ int row_id = batch_id;
+ int col_id = (blockIdx.y * hidden_size + threadIdx.x) << 3;
+ qkv_index = row_major2flat_col32(row_id, col_id, batch_size,
+ (gridDim.y * hidden_size) << 3) >>
+ 3;
+ } else {
+ qkv_index =
+ (batch_id * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+ }
+ int2 ori_qkv8 = p_ori_qkv[qkv_index];
+ float4 bias8 = __ldg(&p_bias[blockIdx.y * blockDim.x + threadIdx.x]);
+ int8_t* p_ori_qkv8 = (int8_t*)(&ori_qkv8);
+ __half* p_bias8 = (__half*)(&bias8);
+#pragma unroll
+ for (int i = 0; i < 8; ++i) {
+ p_new_val[i] =
+ float2int8(float(p_ori_qkv8[i]) * dequant_scale + float(p_bias8[i]),
+ quant_scale);
+ }
+ if (blockIdx.y == 0) {
+ target_id = targetid_4dim(batch_id, head_id, 0, dim_id, head_num, 1,
+ dim_per_head);
+ }
+ }
+
+ if (blockIdx.y == 0) p_new_q[target_id] = new_val;
+ if (blockIdx.y == 1) p_new_k[target_id] = new_val;
+ if (blockIdx.y == 2) p_new_v[target_id] = new_val;
+}
+
+template <typename T>
+void ker_arrange_qkv_with_cache_i8I_i8O_launcher(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t* ori_qkv, const T* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32) {
+  ker_arrange_qkv_with_cache_i8I_i8O<T>
+      <<<dim3(batch_token_num, 3), hidden_size, 0, stream>>>(
+ ori_qkv, qkv_bias, new_q, new_k, k_cache, new_v, v_cache,
+ batch_seq_len, dim_per_head, head_num, dequant_scale, quant_scale,
+ in_col32);
+}
+
+template <>
+void ker_arrange_qkv_with_cache_i8I_i8O_launcher<__half>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t* ori_qkv, const __half* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32) {
+ ker_arrange_qkv_with_cache_i8I_i8O<__half>
+      <<<dim3(batch_token_num, 3), hidden_size / 8, 0, stream>>>(
+ ori_qkv, qkv_bias, new_q, new_k, k_cache, new_v, v_cache,
+ batch_seq_len, dim_per_head / 8, head_num, dequant_scale, quant_scale,
+ in_col32);
+}
+
+template void ker_arrange_qkv_with_cache_i8I_i8O_launcher<float>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t* ori_qkv, const float* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32);
+
+template void ker_arrange_qkv_with_cache_i8I_i8O_launcher<__half>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t* ori_qkv, const __half* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32);
+
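+/**
+@brief: ker_attention_mask_weights_i32I
+causal-masked softmax for self attention during incremental decoding: int32
+correlation scores are dequantized with dequant_scale^2, scaled by attn_scale
+and normalized over the source positions each query is allowed to attend to
+*/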
+template <typename T>
+__global__ void ker_attention_mask_weights_i32I(
+ int32_t* correlation, T* output, const int* real_seq_len, int dst_seq_len,
+ int src_seq_len, float attn_scale, float dequant_scale) {
+ int query_token_pos = blockIdx.y % dst_seq_len + src_seq_len - dst_seq_len;
+ if (query_token_pos >= real_seq_len[blockIdx.x]) {
+ return;
+ }
+ int mask = 0; // can see the token when mask=0
+ if (threadIdx.x > query_token_pos) {
+ mask = 1; // Can only see the token on the left side of it
+ }
+
+ int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+ float val =
+ (float)correlation[idx] * attn_scale * dequant_scale * dequant_scale;
+ float max_val = blockReduceMax(mask ? CUDA_FLOAT_INF_NEG : val);
+ __shared__ float smax;
+ if (threadIdx.x == 0) smax = max_val;
+ __syncthreads();
+
+ val = mask ? 0.f : expf(fmaxf(logit_thresh_min, val - smax));
+ float rsum = blockReduceSum(val);
+ __shared__ float ssum;
+ if (threadIdx.x == 0) ssum = rsum;
+ __syncthreads();
+
+ output[idx] = (T)(val / (ssum + epsilon));
+}
+
+template <typename T>
+void ker_attention_mask_weights_i32I_launcher(
+ int batch_size, int dst_seq_len, int src_seq_len, int head_num,
+ cudaStream_t stream, int32_t* correlation, T* output,
+ const int* real_seq_len, float attn_scale, float dequant_scale) {
+  ker_attention_mask_weights_i32I<T>
+      <<<dim3(batch_size, head_num * dst_seq_len), src_seq_len, 0, stream>>>(
+ correlation, output, real_seq_len, dst_seq_len, src_seq_len,
+ attn_scale, dequant_scale);
+}
+
+template void ker_attention_mask_weights_i32I_launcher<float>(
+ int batch_size, int dst_seq_len, int src_seq_len, int head_num,
+ cudaStream_t stream, int32_t* correlation, float* output,
+ const int* real_seq_len, float attn_scale, float dequant_scale);
+
+template void ker_attention_mask_weights_i32I_launcher<__half>(
+ int batch_size, int dst_seq_len, int src_seq_len, int head_num,
+ cudaStream_t stream, int32_t* correlation, __half* output,
+ const int* real_seq_len, float attn_scale, float dequant_scale);
+
+} // namespace cuda
+} // namespace lightseq
diff --git a/lightseq/inference/kernels/gptKernels_int8.h b/lightseq/inference/kernels/gptKernels_int8.h
new file mode 100644
index 00000000..007e8e9a
--- /dev/null
+++ b/lightseq/inference/kernels/gptKernels_int8.h
@@ -0,0 +1,63 @@
+#pragma once
+#include <cstdint>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <curand_kernel.h>
+
+namespace lightseq {
+namespace cuda {
+
+template <typename T>
+void ker_gpt_embedding_i8I_launcher(int batch_size, int batch_seq_len,
+ int hidden_size, cudaStream_t stream,
+ const int8_t* token_emb, const T* pos_emb,
+ const int* token_id, T* output,
+ int* real_seq_len, int padding_id,
+ int pos_offset, float dequant_scale);
+
+void ker_ppl_i8I_launcher(int batch_size, int batch_seq_len,
+ int max_thread_per_block, cudaStream_t stream,
+ const int8_t* logits, const int* input_ids,
+ const int* real_seq_len, float* ppl, int vocab_size,
+ float dequant_scale, bool in_col32 = false);
+
+template <typename T>
+void ker_correlation_softmax_gpt_i32I_launcher(
+ int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
+ int32_t* correlation, T* output, const int* real_seq_len, float attn_scale,
+ float dequant_scale);
+
+void ker_topk_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ int logits_seq_len, int max_thread_per_block,
+ cudaStream_t stream, const int8_t* logits,
+ int* old_input_ids, int* new_input_ids,
+ const int* real_seq_len, const int vocab_size,
+ const int k, int* all_finished,
+ curandState* curandstate, int eos_id,
+ float dequant_scale, bool in_col32 = false);
+
+void ker_topp_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ int logits_seq_len, int max_thread_per_block,
+ cudaStream_t stream, const int8_t* logits,
+ int* old_input_ids, int* new_input_ids,
+ const int* real_seq_len, const int vocab_size,
+ const float p, int* unfinished,
+ curandState* curandstate, int eos_id,
+ float dequant_scale, bool in_col32 = false);
+
+template <typename T>
+void ker_arrange_qkv_with_cache_i8I_i8O_launcher(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t* ori_qkv, const T* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32 = false);
+
+template <typename T>
+void ker_attention_mask_weights_i32I_launcher(
+ int batch_size, int dst_seq_len, int src_seq_len, int head_num,
+ cudaStream_t stream, int32_t* correlation, T* output,
+ const int* real_seq_len, float attn_scale, float dequant_scale);
+
+} // namespace cuda
+} // namespace lightseq
diff --git a/lightseq/inference/kernels/transformerKernels.cc.cu b/lightseq/inference/kernels/transformerKernels.cc.cu
index c8794312..05a22094 100644
--- a/lightseq/inference/kernels/transformerKernels.cc.cu
+++ b/lightseq/inference/kernels/transformerKernels.cc.cu
@@ -810,7 +810,7 @@ __global__ void ker_arrange_decself_qkv(const T* ori_qkv, const T* qkv_bias,
T val = ori_qkv[(blockIdx.x * gridDim.y + blockIdx.y) * hidden_size + i] +
__ldg(&qkv_bias[blockIdx.y * hidden_size + i]);
int seq_id =
- blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
+ blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
if (blockIdx.y == 0) {
// for query
new_q[seq_id * hidden_size + i] = val;
@@ -841,7 +841,7 @@ __global__ void ker_arrange_decself_qkv<__half>(
half2 val = __hadd2(
p_qkv[(blockIdx.x * gridDim.y + blockIdx.y) * half_hidden_size + i],
__ldg(&p_bias[blockIdx.y * half_hidden_size + i]));
- // obvious,seq_id = batch_id * beam_size + beam_id
+ // obvious, seq_id = batch_id * beam_size + beam_id
int seq_id = blockIdx.x;
if (blockIdx.y == 0) {
// for query
diff --git a/lightseq/inference/kernels/transformerKernels_int8.cc.cu b/lightseq/inference/kernels/transformerKernels_int8.cc.cu
index 67406048..85c5d736 100644
--- a/lightseq/inference/kernels/transformerKernels_int8.cc.cu
+++ b/lightseq/inference/kernels/transformerKernels_int8.cc.cu
@@ -864,85 +864,72 @@ template void ker_residual_bias_ln_i32I_launcher(
 template <typename T>
__global__ void ker_bias_gelu_i8I_i8O(int8_t *input, int8_t *output,
- const T *bias, int total_count,
- int feature_dim, float dequant_scale,
- float quant_scale, bool in_out_col32) {
- int i = blockIdx.x * blockDim.x + threadIdx.x;
-
- if (i * 4 >= total_count) return;
+ const T *bias, int feature_dim,
+ float dequant_scale, float quant_scale,
+ bool in_col32, bool out_col32) {
+ int block_start = blockIdx.x * feature_dim;
+ int start = block_start + threadIdx.x;
+ int end = block_start + feature_dim;
+ for (int i = start; i < end; i += blockDim.x) {
+ int input_index;
+ if (in_col32) {
+ int row_id = blockIdx.x;
+ int col_id = i - block_start;
+ input_index =
+ row_major2flat_col32(row_id, col_id, gridDim.x, feature_dim);
+ } else {
+ input_index = i;
+ }
-  char4 *out4 = reinterpret_cast<char4 *>(output);
-  const char4 *data4 = reinterpret_cast<const char4 *>(input);
-  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
+ float fout = gelu(float(input[input_index]) * dequant_scale +
+ __ldg(&bias[i - block_start]));
- int bias_i;
- if (in_out_col32) {
- int row_size = total_count / feature_dim;
- int flat_i = i << 2;
- int col_id = (flat_i / (row_size * 32)) * 32 + (flat_i & 31);
- bias_i = col_id >> 2;
- } else {
- bias_i = i % (feature_dim >> 2);
+ int output_index;
+ if (out_col32) {
+ int row_id = blockIdx.x;
+ int col_id = i - block_start;
+ output_index =
+ row_major2flat_col32(row_id, col_id, gridDim.x, feature_dim);
+ } else {
+ output_index = i;
+ }
+ output[output_index] = float2int8(fout, quant_scale);
}
-
- const char4 input4 = data4[i];
- const float4 b4 = __ldg(&bias4[bias_i]);
- float4 output4;
-
- output4.x = gelu(float(input4.x) * dequant_scale + b4.x);
- output4.y = gelu(float(input4.y) * dequant_scale + b4.y);
- output4.z = gelu(float(input4.z) * dequant_scale + b4.z);
- output4.w = gelu(float(input4.w) * dequant_scale + b4.w);
-
- char4 out_i4;
- out_i4.x = float2int8(output4.x, quant_scale);
- out_i4.y = float2int8(output4.y, quant_scale);
- out_i4.z = float2int8(output4.z, quant_scale);
- out_i4.w = float2int8(output4.w, quant_scale);
- out4[i] = out_i4;
}
/* fp16 version */
template <>
-__global__ void ker_bias_gelu_i8I_i8O<__half>(int8_t *input, int8_t *output,
- const __half *bias,
- int total_count, int feature_dim,
- float dequant_scale,
- float quant_scale,
- bool in_out_col32) {
- int i = blockIdx.x * blockDim.x + threadIdx.x;
-
- if (i * 8 >= total_count) return;
-
-  const int2 *vals_int2 = reinterpret_cast<const int2 *>(input);
-  int64_t *outs_i8 = reinterpret_cast<int64_t *>(output);
-  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
-
- int bias_i;
- if (in_out_col32) {
- int row_size = total_count / feature_dim;
- int flat_i = i << 3;
- int col_id = (flat_i / (row_size * 32)) * 32 + (flat_i & 31);
- bias_i = col_id >> 3;
- } else {
- bias_i = i % (feature_dim >> 3);
- }
+__global__ void ker_bias_gelu_i8I_i8O<__half>(
+ int8_t *input, int8_t *output, const __half *bias, int feature_dim,
+ float dequant_scale, float quant_scale, bool in_col32, bool out_col32) {
+ int block_start = blockIdx.x * feature_dim;
+ int start = block_start + threadIdx.x;
+ int end = block_start + feature_dim;
+ for (int i = start; i < end; i += blockDim.x) {
+ int input_index;
+ if (in_col32) {
+ int row_id = blockIdx.x;
+ int col_id = i - block_start;
+ input_index =
+ row_major2flat_col32(row_id, col_id, gridDim.x, feature_dim);
+ } else {
+ input_index = i;
+ }
- int2 val_int2 = vals_int2[i];
-  int8_t *val1 = reinterpret_cast<int8_t *>(&val_int2);
-  const float4 b4 = __ldg(&bias4[bias_i]);
-  const __half *b_half = reinterpret_cast<const __half *>(&b4);
-  int64_t out_i8;
-  int8_t *out_i1 = reinterpret_cast<int8_t *>(&out_i8);
+ float fout = gelu(float(input[input_index]) * dequant_scale +
+ __half2float(__ldg(&bias[i - block_start])));
-#pragma unroll
- for (int j = 0; j < 8; ++j) {
- float out_f;
- out_f =
- gelu(float(val1[j]) * dequant_scale + __half2float(b_half[j]));
- out_i1[j] = float2int8(out_f, quant_scale);
+ int output_index;
+ if (out_col32) {
+ int row_id = blockIdx.x;
+ int col_id = i - block_start;
+ output_index =
+ row_major2flat_col32(row_id, col_id, gridDim.x, feature_dim);
+ } else {
+ output_index = i;
+ }
+ output[output_index] = float2int8(fout, quant_scale);
}
- outs_i8[i] = out_i8;
}
 template <typename T>
@@ -950,35 +937,31 @@ void ker_bias_gelu_i8I_i8O_launcher(int batch_token_num, cudaStream_t stream,
int8_t *input, int8_t *output,
const T *bias, int feature_dim,
float dequant_scale, float quant_scale,
- bool in_out_col32) {
- int total_count = batch_token_num * feature_dim;
- int grid_dim = total_count >> 10;
-  ker_bias_gelu_i8I_i8O<T><<<grid_dim + 1, 256, 0, stream>>>(
- input, output, bias, total_count, feature_dim, dequant_scale, quant_scale,
- in_out_col32);
+ bool in_col32, bool out_col32) {
+  ker_bias_gelu_i8I_i8O<T><<<batch_token_num, 1024, 0, stream>>>(
+ input, output, bias, feature_dim, dequant_scale, quant_scale, in_col32,
+ out_col32);
}
template <>
void ker_bias_gelu_i8I_i8O_launcher<__half>(
int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output,
const __half *bias, int feature_dim, float dequant_scale, float quant_scale,
- bool in_out_col32) {
- int total_count = batch_token_num * feature_dim;
- int grid_dim = total_count >> 11;
-  ker_bias_gelu_i8I_i8O<__half><<<grid_dim + 1, 256, 0, stream>>>(
- input, output, bias, total_count, feature_dim, dequant_scale, quant_scale,
- in_out_col32);
+ bool in_col32, bool out_col32) {
+  ker_bias_gelu_i8I_i8O<__half><<<batch_token_num, 1024, 0, stream>>>(
+ input, output, bias, feature_dim, dequant_scale, quant_scale, in_col32,
+ out_col32);
}
 template void ker_bias_gelu_i8I_i8O_launcher<float>(
int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output,
const float *bias, int feature_dim, float dequant_scale, float quant_scale,
- bool in_out_col32);
+ bool in_col32, bool out_col32);
template void ker_bias_gelu_i8I_i8O_launcher<__half>(
int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output,
const __half *bias, int feature_dim, float dequant_scale, float quant_scale,
- bool in_out_col32);
+ bool in_col32, bool out_col32);
 template <typename T>
__global__ void ker_bias_relu_i8I_i8O(int8_t *input, int8_t *output,
@@ -1199,6 +1182,122 @@ template void ker_arrange_encself_qkv_i8I_launcher<__half>(
int max_batch_dim, int batch_seq_len, int dim_per_head, int head_num,
int max_thread_per_block, float dequant_scale, bool in_col32);
+template <typename T>
+__global__ void ker_arrange_encself_qkv_i8I_i8O(
+ const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, T *d_v, int batch_seq_len, int dim_per_head, int head_num,
+ float dequant_scale, float quant_scale, bool in_col32) {
+ int hidden_size = dim_per_head * head_num;
+ int batch_id = blockIdx.x / batch_seq_len;
+ int token_id = blockIdx.x % batch_seq_len;
+ for (std::size_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+ int head_id = i / dim_per_head;
+ int dim_id = i % dim_per_head;
+ int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len, dim_per_head);
+ int qkv_index;
+ if (in_col32) {
+ int row_id = blockIdx.x;
+ int col_id = blockIdx.y * hidden_size + i;
+ qkv_index = row_major2flat_col32(row_id, col_id, gridDim.x,
+ gridDim.y * hidden_size);
+ } else {
+ qkv_index = (blockIdx.x * gridDim.y + blockIdx.y) * hidden_size + i;
+ }
+
+ float val = float(ori_qkv[qkv_index]) * dequant_scale +
+ __ldg(&qkv_bias[blockIdx.y * hidden_size + i]);
+ int8_t quant_val = float2int8(val, quant_scale);
+
+ if (blockIdx.y == 0) {
+ new_q[target_id] = quant_val;
+ } else if (blockIdx.y == 1) {
+ new_k[target_id] = quant_val;
+ } else {
+ new_v[target_id] = quant_val;
+ d_v[target_id] = float(quant_val) / quant_scale;
+ }
+ }
+}
+
+template <>
+__global__ void ker_arrange_encself_qkv_i8I_i8O<__half>(
+ const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, __half *d_v, int batch_seq_len, int dim_per_head,
+ int head_num, float dequant_scale, float quant_scale, bool in_col32) {
+ int hidden_size = dim_per_head * head_num;
+ int batch_id = blockIdx.x / batch_seq_len;
+ int token_id = blockIdx.x % batch_seq_len;
+ for (std::size_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+ int head_id = i / dim_per_head;
+ int dim_id = i % dim_per_head;
+ int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len, dim_per_head);
+ int qkv_index;
+ if (in_col32) {
+ int row_id = blockIdx.x;
+ int col_id = blockIdx.y * hidden_size + i;
+ qkv_index = row_major2flat_col32(row_id, col_id, gridDim.x,
+ gridDim.y * hidden_size);
+ } else {
+ qkv_index = (blockIdx.x * gridDim.y + blockIdx.y) * hidden_size + i;
+ }
+
+ float val = float(ori_qkv[qkv_index]) * dequant_scale +
+ __half2float(__ldg(&qkv_bias[blockIdx.y * hidden_size + i]));
+ int8_t quant_val = float2int8(val, quant_scale);
+
+ if (blockIdx.y == 0) {
+ new_q[target_id] = quant_val;
+ } else if (blockIdx.y == 1) {
+ new_k[target_id] = quant_val;
+ } else {
+ new_v[target_id] = quant_val;
+ d_v[target_id] = __float2half(float(quant_val) / quant_scale);
+ }
+ }
+}
+
+template <typename T>
+void ker_arrange_encself_qkv_i8I_i8O_launcher(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, T *d_v, int batch_seq_len, int dim_per_head, int head_num,
+ int max_thread_per_block, float dequant_scale, float quant_scale,
+ bool in_col32) {
+  ker_arrange_encself_qkv_i8I_i8O<T>
+      <<<dim3(batch_token_num, 3), max_thread_per_block, 0, stream>>>(
+ ori_qkv, qkv_bias, new_q, new_k, new_v, d_v, batch_seq_len,
+ dim_per_head, head_num, dequant_scale, quant_scale, in_col32);
+}
+
+template <>
+void ker_arrange_encself_qkv_i8I_i8O_launcher<__half>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, __half *d_v, int batch_seq_len, int dim_per_head,
+ int head_num, int max_thread_per_block, float dequant_scale,
+ float quant_scale, bool in_col32) {
+ ker_arrange_encself_qkv_i8I_i8O<__half>
+      <<<dim3(batch_token_num, 3), max_thread_per_block, 0, stream>>>(
+ ori_qkv, qkv_bias, new_q, new_k, new_v, d_v, batch_seq_len,
+ dim_per_head, head_num, dequant_scale, quant_scale, in_col32);
+}
+
+template void ker_arrange_encself_qkv_i8I_i8O_launcher<float>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t *ori_qkv, const float *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, float *d_v, int batch_seq_len, int dim_per_head,
+ int head_num, int max_thread_per_block, float dequant_scale,
+ float quant_scale, bool in_col32);
+
+template void ker_arrange_encself_qkv_i8I_i8O_launcher<__half>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, __half *d_v, int batch_seq_len, int dim_per_head,
+ int head_num, int max_thread_per_block, float dequant_scale,
+ float quant_scale, bool in_col32);
+
 template <typename T>
__global__ void ker_arrange_atten_output_i8O(const T *ori_q, int8_t *new_q,
int beam_size, int dim_per_head,
@@ -1294,7 +1393,7 @@ template void ker_arrange_atten_output_i8O_launcher<__half>(
int head_num, int max_thread_per_block, float quant_scale, bool out_col32);
 template <typename T>
-__global__ void ker_arrange_decself_qkv_i8I(
+__global__ void ker_arrange_decself_qkv_i8I_i8O(
const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
float dequant_scale, float quant_scale, bool in_col32) {
@@ -1313,7 +1412,7 @@ __global__ void ker_arrange_decself_qkv_i8I(
__ldg(&qkv_bias[blockIdx.y * hidden_size + i]);
int8_t quant_val = float2int8(val, quant_scale);
int seq_id =
- blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
+ blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
if (blockIdx.y == 0) {
// for query
new_q[seq_id * hidden_size + i] = quant_val;
@@ -1334,7 +1433,7 @@ __global__ void ker_arrange_decself_qkv_i8I(
}
template <>
-__global__ void ker_arrange_decself_qkv_i8I<__half>(
+__global__ void ker_arrange_decself_qkv_i8I_i8O<__half>(
const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
float dequant_scale, float quant_scale, bool in_col32) {
@@ -1353,7 +1452,7 @@ __global__ void ker_arrange_decself_qkv_i8I<__half>(
__half2float(__ldg(&qkv_bias[blockIdx.y * hidden_size + i]));
int8_t quant_val = float2int8(val, quant_scale);
int seq_id =
- blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
+ blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
if (blockIdx.y == 0) {
// for query
new_q[seq_id * hidden_size + i] = quant_val;
@@ -1374,39 +1473,39 @@ __global__ void ker_arrange_decself_qkv_i8I<__half>(
}
 template <typename T>
-void ker_arrange_decself_qkv_i8I_launcher(
+void ker_arrange_decself_qkv_i8I_i8O_launcher(
int step_token_num, int hidden_size, cudaStream_t stream,
const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
int max_thread_per_block, float dequant_scale, float quant_scale,
bool in_col32) {
-  ker_arrange_decself_qkv_i8I<T>
+  ker_arrange_decself_qkv_i8I_i8O<T>
       <<<dim3(step_token_num, 3), max_thread_per_block, 0, stream>>>(
ori_qkv, qkv_bias, new_q, new_k, new_v, head_num, dim_per_head,
max_step, step_id, dequant_scale, quant_scale, in_col32);
}
// template <>
-// void ker_arrange_decself_qkv_i8I_launcher<__half>(
+// void ker_arrange_decself_qkv_i8I_i8O_launcher<__half>(
// int step_token_num, int hidden_size, cudaStream_t stream,
// const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t
// *new_k, int8_t *new_v, int head_num, int dim_per_head, int max_step, int
// step_id, int max_thread_per_block, float dequant_scale, float
// quant_scale, bool in_col32) {
-// ker_arrange_decself_qkv_i8I<__half>
+// ker_arrange_decself_qkv_i8I_i8O<__half>
 //       <<<dim3(step_token_num, 3), max_thread_per_block, 0, stream>>>(
// ori_qkv, qkv_bias, new_q, new_k, new_v, head_num, dim_per_head,
// max_step, step_id, dequant_scale, quant_scale, in_col32);
// }
-template void ker_arrange_decself_qkv_i8I_launcher<float>(
+template void ker_arrange_decself_qkv_i8I_i8O_launcher<float>(
int step_token_num, int hidden_size, cudaStream_t stream,
const int8_t *ori_qkv, const float *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
int max_thread_per_block, float dequant_scale, float quant_scale,
bool in_col32);
-template void ker_arrange_decself_qkv_i8I_launcher<__half>(
+template void ker_arrange_decself_qkv_i8I_i8O_launcher<__half>(
int step_token_num, int hidden_size, cudaStream_t stream,
const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
@@ -1414,7 +1513,7 @@ template void ker_arrange_decself_qkv_i8I_launcher<__half>(
bool in_col32);
/**
-@brief: ker_fuse_softmax_new_value_int8
+@brief: ker_fuse_softmax_new_value_i32I_i8O
fused query-key correlation softmax and new_value for decoder self attention
@thread
@@ -1424,10 +1523,10 @@ blockDim.x = first multiple of WARP_SIZE greater than cur_step + 1
@param
correlation: [batch_size, beam_size, head_num, cur_step + 1]
*/
-__global__ void ker_fuse_softmax_new_value_int8(
+__global__ void ker_fuse_softmax_new_value_i32I_i8O(
const int32_t *logits, const int8_t *v, int8_t *new_v, int step_num,
int max_step, int head_num, int dim_per_head, float attn_scale,
- float dequant_scale, float quant_scale, bool col32_out) {
+ float dequant_scale, float quant_scale, bool out_col32) {
int idx = blockIdx.x * max_step + threadIdx.x;
float val = threadIdx.x < step_num ? float(logits[idx]) * dequant_scale *
dequant_scale * attn_scale
@@ -1470,28 +1569,28 @@ __global__ void ker_fuse_softmax_new_value_int8(
int col = head_idx * dim_per_head + i;
int col_size = head_num * dim_per_head;
int new_v_idx = row * col_size + col;
- if (col32_out) {
+ if (out_col32) {
new_v_idx = row_major2flat_col32(row, col, row_size, col_size);
}
new_v[new_v_idx] = float2int8(block_new_value[i], quant_scale);
}
}
-void ker_fuse_softmax_new_value_int8_launcher(
+void ker_fuse_softmax_new_value_i32I_i8O_launcher(
const int32_t *correlation, const int8_t *v, int8_t *new_v,
int batch_head_num, int step_num, int max_step, int head_num,
int dim_per_head, float attn_scale, float dequant_scale, float quant_scale,
- bool col32_out, cudaStream_t stream) {
+ bool out_col32, cudaStream_t stream) {
int block_dim = step_num;
if (step_num < 1024) {
block_dim = (step_num + 31) >> 5;
block_dim *= 32;
}
- ker_fuse_softmax_new_value_int8<<<
+ ker_fuse_softmax_new_value_i32I_i8O<<<
batch_head_num, block_dim,
dim_per_head * sizeof(float) + step_num * sizeof(float), stream>>>(
correlation, v, new_v, step_num, max_step, head_num, dim_per_head,
- attn_scale, dequant_scale, quant_scale, col32_out);
+ attn_scale, dequant_scale, quant_scale, out_col32);
}
template
@@ -1806,5 +1905,420 @@ template void select_beam_rough_topk_i8I_launcher<__half>(
int max_thread_per_block, cudaStream_t stream, int beam_size,
float diverse_lambda, int end_id, bool in_col32);
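+
+/**
+@brief: ker_topk_sample_i8I
+top-k sampling over int8 logits with an extra logit bias term; the sampled id
+of each sequence is written to position batch_seq_len of its slot in
+old_input_ids
+*/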
+template <typename T, int k>
+__global__ void ker_topk_sample_i8I(const int8_t *logits, const T *logit_bias,
+ int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const int max_step,
+ const int batch_seq_len, int logits_seq_len,
+ int *unfinished, curandState *curandstate,
+ int eos_id, float dequant_scale,
+ bool in_col32) {
+ int last_token_idx_in_batch = blockIdx.x * max_step + batch_seq_len - 1;
+
+ /* add EOS to end if last token is EOS */
+ if (batch_seq_len > 1 && old_input_ids[last_token_idx_in_batch] == eos_id) {
+ if (threadIdx.x == 0) {
+ old_input_ids[last_token_idx_in_batch + 1] = eos_id;
+ }
+ return;
+ }
+ int logits_token_idx_in_batch =
+ blockIdx.x * logits_seq_len + logits_seq_len - 1;
+ int left_logit_idx = logits_token_idx_in_batch * vocab_size + threadIdx.x;
+ int right_logit_idx = (logits_token_idx_in_batch + 1) * vocab_size;
+
+ /*
+ step1. find max logit and rough Kth logit over the whole vocab
+ */
+ __shared__ float s_max_logit, s_topk_logit;
+ float rough_top_kth_logit = CUDA_FLOAT_INF_NEG;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ rough_top_kth_logit = fmaxf(
+ rough_top_kth_logit,
+ (float)(logits[logits_idx]) * dequant_scale +
+ (float)__ldg(&logit_bias[idx - left_logit_idx + threadIdx.x]));
+ }
+ float max_logit = blockReduceMax(rough_top_kth_logit);
+  rough_top_kth_logit = blockRoughTopK<float, k>(rough_top_kth_logit);
+ if (threadIdx.x == 0) {
+ s_topk_logit = rough_top_kth_logit;
+ s_max_logit = max_logit;
+ }
+ __syncthreads();
+
+ __shared__ int s_tid;
+
+ if (k != 1) {
+ /* step2 hold one logit per thread which larger than Kth logit and sample
+ * from them */
+ float topk_exp_sum, topk_exp = CUDA_FLOAT_INF_NEG;
+ int topk_tid = vocab_size;
+ // int test_num = 0;
+ __shared__ float s_topk_exp_sum;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(
+ row_id, col_id, gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit =
+ (float)logits[logits_idx] * dequant_scale +
+ (float)__ldg(&logit_bias[idx - left_logit_idx + threadIdx.x]);
+ float logit_exp = expf(fmaxf(logit - s_max_logit, logit_thresh_min));
+ // if (logit >= s_topk_logit) test_num++;
+ if (logit >= s_topk_logit && logit_exp > topk_exp) {
+ topk_exp = logit_exp;
+ topk_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+
+ // test_num = blockReduceSum(test_num);
+ // __shared__ int s_test_num;
+ // if (threadIdx.x == 0) {
+ // s_test_num = test_num;
+ // if (s_test_num != 1) printf("sample from top %d\n", s_test_num);
+ // // printf("sample from top %s", test_num);
+ // }
+ // __syncthreads();
+
+ if (topk_tid == vocab_size) topk_exp = 0;
+ topk_exp_sum = blockReduceSum(topk_exp);
+ if (threadIdx.x == 0) {
+ s_topk_exp_sum = topk_exp_sum;
+ }
+ __syncthreads();
+
+ /* calculate cumulative probability */
+ float topk_prob = topk_exp / s_topk_exp_sum;
+ float prefix_sum_prob;
+    typedef cub::BlockScan<float, 1024> BlockScan;
+ __shared__ typename BlockScan::TempStorage temp_storage;
+ BlockScan(temp_storage).InclusiveSum(topk_prob, prefix_sum_prob);
+
+ __shared__ float random_x;
+ if (threadIdx.x == 0) {
+ random_x = curand_uniform(curandstate + blockIdx.x);
+ }
+ __syncthreads();
+
+ if (threadIdx.x == 0) {
+ s_tid = vocab_size;
+ }
+ __syncthreads();
+
+ int threadID = threadIdx.x;
+ __shared__ int s_threadID;
+ __shared__ float s_max_prob;
+ if (random_x > prefix_sum_prob) threadID = blockDim.x;
+ threadID = blockReduceMin(threadID);
+ float max_prob = blockReduceMax(topk_prob);
+ if (threadIdx.x == 0) {
+ s_threadID = threadID;
+ s_max_prob = max_prob;
+ }
+ __syncthreads();
+ if (threadIdx.x == s_threadID) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+ if (s_tid == vocab_size && topk_prob == s_max_prob) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+ } else {
+ s_tid = vocab_size;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(
+ row_id, col_id, gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit =
+ (float)logits[logits_idx] * dequant_scale +
+ (float)__ldg(&logit_bias[idx - left_logit_idx + threadIdx.x]);
+ if (logit == s_max_logit) {
+ s_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+ __syncthreads();
+ }
+
+  /* if the newly sampled tid is not EOS, set unfinished to TRUE */
+ if (threadIdx.x == 0) {
+ if (s_tid != eos_id) unfinished[0] = 1;
+ }
+
+ /* step3 write back new sampled ids */
+ if (threadIdx.x == 0) {
+ old_input_ids[last_token_idx_in_batch + 1] = s_tid;
+ }
+}
+
+template <typename T>
+void ker_topk_sample_i8I_launcher(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const T *logit_bias, int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const int k, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32) {
+ if (k == 1)
+    ker_topk_sample_i8I<T, 1><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 2)
+    ker_topk_sample_i8I<T, 2><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 4)
+    ker_topk_sample_i8I<T, 4><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 8)
+    ker_topk_sample_i8I<T, 8><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 16)
+    ker_topk_sample_i8I<T, 16><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 32)
+    ker_topk_sample_i8I<T, 32><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else {
+ throw std::invalid_argument("topk argument should be in [1,2,4,8,16,32]");
+ }
+}
+
+template void ker_topk_sample_i8I_launcher<float>(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const float *logit_bias, int *old_input_ids, int *new_input_idx,
+ const int vocab_size, const int k, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32);
+
+template void ker_topk_sample_i8I_launcher<__half>(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const __half *logit_bias, int *old_input_ids, int *new_input_idx,
+ const int vocab_size, const int k, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32);
+
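+/**
+@brief: ker_topp_sample_i8I
+nucleus (top-p) sampling over int8 logits with an extra logit bias term; the
+sampled id of each sequence is written to position batch_seq_len of its slot
+in old_input_ids
+*/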
+template <typename T>
+__global__ void ker_topp_sample_i8I(const int8_t *logits, const T *logit_bias,
+ int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const int max_step,
+ const int batch_seq_len, int logits_seq_len,
+ int *unfinished, float p,
+ curandState *curandstate, int eos_id,
+ float dequant_scale, bool in_col32) {
+ int token_idx_in_batch = blockIdx.x * max_step + batch_seq_len - 1;
+
+ /* add EOS to end if last token is EOS */
+ if (batch_seq_len > 1 && old_input_ids[token_idx_in_batch] == eos_id) {
+ if (threadIdx.x == 0) {
+ old_input_ids[token_idx_in_batch + 1] = eos_id;
+ }
+ return;
+ }
+ int logits_token_idx_in_batch =
+ blockIdx.x * logits_seq_len + logits_seq_len - 1;
+ int left_logit_idx = logits_token_idx_in_batch * vocab_size + threadIdx.x;
+ int right_logit_idx = (logits_token_idx_in_batch + 1) * vocab_size;
+
+ /* step1. find max logit in each thread and sample from these probs with
+ * nucleus sampling */
+ __shared__ float s_max_logit;
+ float max_logit = CUDA_FLOAT_INF_NEG;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+    max_logit =
+        fmaxf(max_logit,
+              (float)logits[logits_idx] * dequant_scale +
+                  (float)__ldg(&logit_bias[idx - left_logit_idx + threadIdx.x]));
+ }
+ float max_logit_array[1];
+ max_logit_array[0] = max_logit;
+  typedef cub::BlockRadixSort<float, 1024, 1> BlockRadixSort;
+ __shared__ typename BlockRadixSort::TempStorage sort_temp_storage;
+ BlockRadixSort(sort_temp_storage).SortDescending(max_logit_array);
+ float presum_max_logit_exp;
+ max_logit = max_logit_array[0];
+
+ float block_max_logit = blockReduceMax(max_logit);
+ if (threadIdx.x == 0) {
+ s_max_logit = block_max_logit;
+ }
+ __syncthreads();
+
+ float biased_logit_exp =
+ expf(fmaxf(max_logit - s_max_logit, logit_thresh_min));
+
+  typedef cub::BlockScan<float, 1024> BlockScan;
+ __shared__ typename BlockScan::TempStorage presum_temp_storage;
+ BlockScan(presum_temp_storage)
+ .InclusiveSum(biased_logit_exp, presum_max_logit_exp);
+
+ float topp_exp_threshold;
+ if (threadIdx.x == blockDim.x - 1) {
+ topp_exp_threshold = p * presum_max_logit_exp;
+ }
+ __shared__ float s_presum_logit_exp_threshold;
+ if (presum_max_logit_exp > topp_exp_threshold) {
+ presum_max_logit_exp = CUDA_FLOAT_INF_NEG;
+ }
+ float logit_exp_threshold = blockReduceMax(presum_max_logit_exp);
+ if (threadIdx.x == 0) {
+ s_presum_logit_exp_threshold = logit_exp_threshold;
+ }
+ __syncthreads();
+
+ __shared__ float s_logit_threshold;
+ if (presum_max_logit_exp == s_presum_logit_exp_threshold) {
+ s_logit_threshold = max_logit;
+ }
+ __syncthreads();
+
+ /* step2. hold one logit per thread which is no smaller than the threshold
+ * logit and sample from them */
+ float topk_exp_sum, topk_exp = CUDA_FLOAT_INF_NEG;
+ int topk_tid = vocab_size;
+ int test_num = 0;
+ __shared__ float s_topk_exp_sum;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit = (float)logits[logits_idx] * dequant_scale +
+ (float)__ldg(&logit_bias[idx - left_logit_idx + threadIdx.x]);
+ float logit_exp = expf(fmaxf(logit - s_max_logit, logit_thresh_min));
+ if (logit >= s_logit_threshold) test_num++;
+ if (logit >= s_logit_threshold && logit_exp > topk_exp) {
+ topk_exp = logit_exp;
+ topk_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+
+ test_num = blockReduceSum(test_num);
+
+ if (topk_tid == vocab_size) topk_exp = 0;
+ topk_exp_sum = blockReduceSum(topk_exp);
+ if (threadIdx.x == 0) {
+ s_topk_exp_sum = topk_exp_sum;
+ }
+ __syncthreads();
+
+ /* calculate cumulative probability */
+ float topk_prob = topk_exp / s_topk_exp_sum;
+ float prefix_sum_prob;
+ BlockScan(presum_temp_storage).InclusiveSum(topk_prob, prefix_sum_prob);
+
+ __shared__ float random_x;
+ if (threadIdx.x == 0) {
+ random_x = curand_uniform(curandstate + blockIdx.x);
+ }
+ __syncthreads();
+
+ __shared__ int s_tid;
+ if (threadIdx.x == 0) {
+ s_tid = vocab_size;
+ }
+ __syncthreads();
+
+ int threadID = threadIdx.x;
+ __shared__ int s_threadID;
+ __shared__ float s_max_prob;
+ if (random_x > prefix_sum_prob) threadID = blockDim.x;
+ threadID = blockReduceMin(threadID);
+ float max_prob = blockReduceMax(topk_prob);
+ if (threadIdx.x == 0) {
+ s_threadID = threadID;
+ s_max_prob = max_prob;
+ }
+ __syncthreads();
+ if (threadIdx.x == s_threadID) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+ if (s_tid == vocab_size && topk_prob == s_max_prob) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+ /* if new sampled tid is not EOS, set unfinished TRUE */
+ if (threadIdx.x == 0) {
+ if (s_tid != eos_id) unfinished[0] = 1;
+ }
+
+ /* step3 write back new sampled ids */
+ if (threadIdx.x == 0) {
+ old_input_ids[token_idx_in_batch + 1] = s_tid;
+ }
+}
+
+template <typename T>
+void ker_topp_sample_i8I_launcher(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const T *logit_bias, int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const float p, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32) {
+ ker_topp_sample_i8I<T><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, p, curandstate, eos_id,
+ dequant_scale, in_col32);
+}
+
+template void ker_topp_sample_i8I_launcher<float>(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const float *logit_bias, int *old_input_ids, int *new_input_idx,
+ const int vocab_size, const float p, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32);
+
+template void ker_topp_sample_i8I_launcher<__half>(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const __half *logit_bias, int *old_input_ids, int *new_input_idx,
+ const int vocab_size, const float p, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32);
+
} // namespace cuda
} // namespace lightseq
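
The `ker_topp_sample_i8I` kernel above implements nucleus (top-p) sampling over int8 logits: dequantize, softmax, keep only the highest-probability tokens whose cumulative mass reaches `p`, then draw from that renormalized set. A standalone host-side sketch of the same selection rule (illustrative only, not the CUDA kernel, and all names are made up for the example):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>

// Illustrative host-side top-p (nucleus) sampling over a logits vector.
int nucleus_sample(const std::vector<float>& logits, float p, std::mt19937& rng) {
  // softmax with max subtraction for numerical stability
  float max_logit = *std::max_element(logits.begin(), logits.end());
  std::vector<float> probs(logits.size());
  float sum = 0.f;
  for (size_t i = 0; i < logits.size(); ++i) {
    probs[i] = std::exp(logits[i] - max_logit);
    sum += probs[i];
  }
  for (float& x : probs) x /= sum;

  // sort token ids by probability, descending
  std::vector<int> order(logits.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(),
            [&](int a, int b) { return probs[a] > probs[b]; });

  // keep the smallest prefix whose cumulative probability reaches p
  float cum = 0.f;
  size_t keep = 0;
  while (keep < order.size()) {
    cum += probs[order[keep++]];
    if (cum >= p) break;
  }

  // sample from the renormalized prefix
  std::uniform_real_distribution<float> uni(0.f, cum);
  float r = uni(rng), acc = 0.f;
  for (size_t i = 0; i < keep; ++i) {
    acc += probs[order[i]];
    if (r <= acc) return order[i];
  }
  return order[keep - 1];
}

int main() {
  std::mt19937 rng(42);
  std::vector<float> logits = {2.0f, 1.0f, 0.5f, -1.0f};
  printf("sampled token id: %d\n", nucleus_sample(logits, 0.9f, rng));
}
```

The kernel performs the same prefix selection in parallel with a block-wide radix sort and inclusive scan instead of the host-side `std::sort` and running sum.
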
diff --git a/lightseq/inference/kernels/transformerKernels_int8.h b/lightseq/inference/kernels/transformerKernels_int8.h
index 247943ed..cfe7690a 100644
--- a/lightseq/inference/kernels/transformerKernels_int8.h
+++ b/lightseq/inference/kernels/transformerKernels_int8.h
@@ -2,6 +2,7 @@
#include
#include
+#include
#include
namespace lightseq {
@@ -30,7 +31,8 @@ void ker_bias_gelu_i8I_i8O_launcher(int batch_token_num, cudaStream_t stream,
int8_t *input, int8_t *output,
const T *bias, int feature_dim,
float dequant_scale, float quant_scale,
- bool in_out_col32 = false);
+ bool in_col32 = false,
+ bool out_col32 = false);
// TODO: remove clip_max
template <typename T>
@@ -38,8 +40,8 @@ void ker_bias_relu_i8I_i8O_launcher(int batch_token_num, cudaStream_t stream,
int8_t *input, int8_t *output,
const T *bias, int feature_dim,
float dequant_scale, float quant_scale,
- float clip_max, bool in_col32 = true,
- bool out_col32 = true,
+ float clip_max, bool in_col32 = false,
+ bool out_col32 = false,
bool narrow_clip = false);
template <typename T>
@@ -47,16 +49,16 @@ void ker_residual_bias_ln_i32I_i8O_launcher(
const int32_t *input, const T *scale, const T *bias, const T *residual_bias,
int8_t *output, T *residual, int batch_tokens, int hidden_size,
float dequant_scale, float quant_scale, int max_thread_per_block,
- cudaStream_t stream, bool is_post_ln = false, bool in_col32 = true,
- bool out_col32 = true, const T *colsum = nullptr);
+ cudaStream_t stream, bool is_post_ln = false, bool in_col32 = false,
+ bool out_col32 = false, const T *colsum = nullptr);
template <typename T>
void ker_residual_bias_ln_i8I_i8O_launcher(
const int8_t *input, const T *scale, const T *bias, const T *residual_bias,
int8_t *output, T *residual, int batch_tokens, int hidden_size,
float dequant_scale, float quant_scale, int max_thread_per_block,
- cudaStream_t stream, bool is_post_ln = false, bool in_col32 = true,
- bool out_col32 = true, const T *colsum = nullptr);
+ cudaStream_t stream, bool is_post_ln = false, bool in_col32 = false,
+ bool out_col32 = false, const T *colsum = nullptr);
template <typename T>
void ker_residual_bias_ln_i32I_launcher(
@@ -72,6 +74,14 @@ void ker_arrange_encself_qkv_i8I_launcher(
int batch_seq_len, int dim_per_head, int head_num, int max_thread_per_block,
float dequant_scale, bool in_col32 = false);
+template <typename T>
+void ker_arrange_encself_qkv_i8I_i8O_launcher(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, T *d_v, int batch_seq_len, int dim_per_head, int head_num,
+ int max_thread_per_block, float dequant_scale, float quant_scale,
+ bool in_col32 = false);
+
template <typename T>
void ker_arrange_atten_output_i8O_launcher(
int batch_token_num, int hidden_size, cudaStream_t stream, const T *ori_q,
@@ -79,17 +89,17 @@ void ker_arrange_atten_output_i8O_launcher(
int max_thread_per_block, float quant_scale, bool out_col32 = false);
template <typename T>
-void ker_arrange_decself_qkv_i8I_launcher(
+void ker_arrange_decself_qkv_i8I_i8O_launcher(
int step_token_num, int hidden_size, cudaStream_t stream,
const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
int max_thread_per_block, float dequant_scale, float quant_scale,
bool in_col32 = false);
-void ker_fuse_softmax_new_value_int8_launcher(
+void ker_fuse_softmax_new_value_i32I_i8O_launcher(
const int32_t *correlation, const int8_t *v, int8_t *new_v,
int batch_head_num, int step_num, int max_step, int head_num, int head_dim,
- float attn_scale, float dequant_scale, float quant_scale, bool col32_out,
+ float attn_scale, float dequant_scale, float quant_scale, bool out_col32,
cudaStream_t stream);
template <typename T>
@@ -110,5 +120,27 @@ void select_beam_rough_topk_i8I_launcher(
int max_thread_per_block, cudaStream_t stream, int beam_size,
float diverse_lambda, int end_id, bool in_col32 = false);
+template <typename T>
+void ker_topk_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream,
+ const int8_t *logits, const T *logit_bias,
+ int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const int k,
+ int *all_finished, curandState *curandstate,
+ int eos_id, float dequant_scale,
+ bool in_col32 = false);
+
+template <typename T>
+void ker_topp_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream,
+ const int8_t *logits, const T *logit_bias,
+ int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const float p,
+ int *unfinished, curandState *curandstate,
+ int eos_id, float dequant_scale,
+ bool in_col32 = false);
+
} // namespace cuda
} // namespace lightseq
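
The `dequant_scale` and `quant_scale` arguments threaded through these launchers follow a symmetric per-tensor scheme with a quantization range of 127: values are quantized as `q = round(x * 127 / clip_max)` and recovered as `x ≈ q * clip_max / 127`. A minimal sketch of that convention, assuming symmetric quantization against a calibrated `clip_max` (illustrative only, not LightSeq code):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Symmetric per-tensor int8 quantization with a calibrated clip_max,
// mirroring the quant_scale = 127 / clip_max and
// dequant_scale = clip_max / 127 factors passed to the launchers above.
constexpr float kQuantRange = 127.f;

int8_t quantize(float x, float clip_max) {
  float scaled = x * (kQuantRange / clip_max);
  scaled = std::min(std::max(scaled, -kQuantRange), kQuantRange);  // clip
  return static_cast<int8_t>(std::nearbyint(scaled));
}

float dequantize(int8_t q, float clip_max) {
  return static_cast<float>(q) * (clip_max / kQuantRange);
}

int main() {
  float clip_max = 4.0f;  // e.g. a per-layer activation clip from calibration
  const float xs[] = {0.03f, -1.7f, 3.9f, 5.2f /* out of range, gets clipped */};
  for (float x : xs) {
    int8_t q = quantize(x, clip_max);
    printf("x=% .2f  ->  q=%4d  ->  x'=% .3f\n", x, q, dequantize(q, clip_max));
  }
}
```
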
diff --git a/lightseq/inference/model/CMakeLists.txt b/lightseq/inference/model/CMakeLists.txt
index 16275320..c28ec64a 100644
--- a/lightseq/inference/model/CMakeLists.txt
+++ b/lightseq/inference/model/CMakeLists.txt
@@ -42,6 +42,18 @@ endif()
target_include_directories(gpt_model PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+add_library(quant_gpt_model STATIC quant_gpt_encoder.cc.cu)
+target_link_libraries(quant_gpt_model PUBLIC cuda_kernels)
+target_link_libraries(quant_gpt_model PUBLIC quant_gpt_weight)
+if(DYNAMIC_API)
+ target_link_libraries(quant_gpt_model PRIVATE CUDA::cublas CUDA::cublasLt)
+else()
+ target_link_libraries(quant_gpt_model PRIVATE CUDA::cublas_static
+ CUDA::cublasLt_static)
+endif()
+
+target_include_directories(quant_gpt_model PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
add_library(bert_model STATIC bert_encoder.cc.cu)
target_link_libraries(bert_model PUBLIC cuda_kernels)
target_link_libraries(bert_model PUBLIC bert_weight)
@@ -52,6 +64,16 @@ else()
CUDA::cublasLt_static)
endif()
+add_library(quant_bert_model STATIC quant_bert_encoder.cc.cu)
+target_link_libraries(quant_bert_model PUBLIC cuda_kernels)
+target_link_libraries(quant_bert_model PUBLIC quant_bert_weight)
+if(DYNAMIC_API)
+ target_link_libraries(quant_bert_model PRIVATE CUDA::cublas CUDA::cublasLt)
+else()
+ target_link_libraries(quant_bert_model PRIVATE CUDA::cublas_static
+ CUDA::cublasLt_static)
+endif()
+
set(moe_files moe_decoder.cc.cu moe_encoder.cc.cu)
add_library(moe_model STATIC ${moe_files})
target_link_libraries(moe_model PUBLIC cuda_kernels)
diff --git a/lightseq/inference/model/encoder.h b/lightseq/inference/model/encoder.h
index b54bf6b7..fe204dcb 100644
--- a/lightseq/inference/model/encoder.h
+++ b/lightseq/inference/model/encoder.h
@@ -17,7 +17,7 @@
/**
@file
-Transformer decoder, composed by gemm lib and
+Transformer encoder, composed by gemm lib and
custom cuda kernel function
*/
diff --git a/lightseq/inference/model/gpt_encoder.h b/lightseq/inference/model/gpt_encoder.h
index 3ea74f6a..8ca2856f 100644
--- a/lightseq/inference/model/gpt_encoder.h
+++ b/lightseq/inference/model/gpt_encoder.h
@@ -53,7 +53,7 @@ class GptEncoder {
std::vector _h_sample_id;
int _h_unfinished;
- // gpu memeory buffer
+ // gpu memory buffer
_DataType *_p_d_query;
_DataType *_p_d_k_cache;
_DataType *_p_d_v_cache;
diff --git a/lightseq/inference/model/quant_bert_encoder.cc.cu b/lightseq/inference/model/quant_bert_encoder.cc.cu
new file mode 100644
index 00000000..c02b90ea
--- /dev/null
+++ b/lightseq/inference/model/quant_bert_encoder.cc.cu
@@ -0,0 +1,483 @@
+#include "quant_bert_encoder.h"
+#include "../kernels/embKernels_int8.h"
+#include "../kernels/transformerKernels.h"
+#include "../kernels/transformerKernels_int8.h"
+#include "cublas_helper.h"
+
+/**
+@file
+QuantBert encoder, composed by gemm lib and
+ custom cuda kernel function
+*/
+
+namespace lightseq {
+namespace cuda {
+
+template <OperationType OpType_>
+QuantBertEncoder<OpType_>::QuantBertEncoder(
+ int max_batch_size, const int *p_d_token_id, int *p_d_padding_mask,
+ _DataType *p_d_output, const QuantBertWeight<OpType_> &tw,
+ cudaStream_t stream, cublasHandle_t hd, const int *p_d_lang_id)
+ : _max_batch_size(max_batch_size),
+ _p_d_token_id(p_d_token_id),
+ _p_d_padding_mask(p_d_padding_mask),
+ _p_d_output(p_d_output),
+ _p_d_lang_id(p_d_lang_id),
+ _tw(tw),
+ _stream(stream),
+ _hd(hd),
+ _p_d_src_emb_wei(tw.get_src_emb_wei()),
+ _p_d_enc_wei(tw.get_enc_wei()),
+ _fone((_DataType)1.f),
+ _fzero((_DataType)0.f),
+ _src_emb_clip_max(tw.get_src_emb_clip_max()),
+ _enc_clip_max(tw.get_enc_clip_max()),
+ _ione((int32_t)1),
+ _izero((int32_t)0),
+ _atten_scaler((_DataType)sqrt(1.f / tw._dim_per_head)),
+ _max_batch_dim(max_batch_size * tw._max_step * tw._hidden_size),
+ _max_thread_per_block(1024) {
+ CHECK_GPU_ERROR(cublasLtCreate(&_cublas_lt_handle));
+}
+
+/**
+Init the GPU memory pointers which point to
+ the memory buffers needed by the encoder.
+These buffers are used by the custom cuda kernel functions;
+ find the corresponding function to see how each buffer is used
+*/
+template <OperationType OpType_>
+void QuantBertEncoder<OpType_>::init_buffer() {
+ std::cout << "encoder buffer init start" << std::endl;
+
+ _DataType *qkv_buf;
+ CHECK_GPU_ERROR(cudaMalloc(&qkv_buf, 3 * _max_batch_dim * sizeof(_DataType)));
+ _p_d_q = qkv_buf;
+ _p_d_k = qkv_buf + _max_batch_dim;
+ _p_d_v = qkv_buf + 2 * _max_batch_dim;
+
+ CHECK_GPU_ERROR(cudaMalloc(&_p_d_c, _max_batch_size * _tw._head_num *
+ _tw._max_step * _tw._max_step *
+ sizeof(_DataType)));
+
+ int max_batch_dim = _max_batch_size * _tw._max_step *
+ std::max(_tw._inner_size, _tw._hidden_size * 3);
+ CHECK_GPU_ERROR(cudaMalloc(&_int8_ffn_in_buf, max_batch_dim));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int32_ffn_out_buf, max_batch_dim * sizeof(int32_t)));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_ffn_out_buf, max_batch_dim * sizeof(int8_t)));
+
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_src_emb_wei,
+ _tw._src_vocab_size * _tw._hidden_size * sizeof(int8_t)));
+ quantize_weight(_p_d_src_emb_wei[0], _int8_p_d_src_emb_wei,
+ _tw._src_vocab_size, _tw._hidden_size,
+ _quant_range / _src_emb_clip_max, _stream, _cublas_lt_handle,
+ kRowMajor);
+
+ _p_device_emb.push_back(nullptr);
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[1], _tw._max_step * _tw._hidden_size, _stream));
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[2], _tw._hidden_size, _stream));
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[3], _tw._hidden_size, _stream));
+ if (_tw._multilg_type != 0) {
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[4], _tw._hidden_size, _stream));
+ } else {
+ _p_device_emb.push_back(nullptr);
+ }
+
+ // prepare gpu memory for weight
+ _int8_p_d_enc_wei = std::vector<int8_t *>(_tw._n_enc_layer * 4);
+ _scaled_ffn2_colsum = std::vector<_DataType *>(_tw._n_enc_layer);
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ CHECK_GPU_ERROR(cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4],
+ _tw._hidden_size * 3 * _tw._hidden_size));
+ CHECK_GPU_ERROR(cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 1],
+ _tw._hidden_size * _tw._hidden_size));
+ CHECK_GPU_ERROR(cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 2],
+ _tw._hidden_size * _tw._inner_size));
+ CHECK_GPU_ERROR(cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 3],
+ _tw._inner_size * _tw._hidden_size));
+
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 1], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(to_gpu(_p_d_enc_wei[_weight_offset + 3],
+ _tw._hidden_size * 3, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 5], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 6], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 7], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 9], _tw._inner_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 11], _tw._hidden_size, _stream));
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 2],
+ _int8_p_d_enc_wei[_layer_id * 4], _tw._hidden_size,
+ _tw._hidden_size * 3,
+ _quant_range / _enc_clip_max[_layer_id * 11], _stream,
+ _cublas_lt_handle);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 4],
+ _int8_p_d_enc_wei[_layer_id * 4 + 1], _tw._hidden_size,
+ _tw._hidden_size,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 1], _stream,
+ _cublas_lt_handle);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 8],
+ _int8_p_d_enc_wei[_layer_id * 4 + 2], _tw._hidden_size,
+ _tw._inner_size,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 2], _stream,
+ _cublas_lt_handle);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 10],
+ _int8_p_d_enc_wei[_layer_id * 4 + 3], _tw._inner_size,
+ _tw._hidden_size,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 3], _stream,
+ _cublas_lt_handle);
+
+ if (_tw._use_gelu) {
+ _scaled_ffn2_colsum[_layer_id] = nullptr;
+ } else {
+ CHECK_GPU_ERROR(cudaMalloc(&_scaled_ffn2_colsum[_layer_id],
+ _tw._hidden_size * sizeof(_DataType)));
+ float relu_scale = _enc_clip_max[_layer_id * 11 + 7] / 2;
+ _DataType *temp;
+ int weight_size = _tw._inner_size * _tw._hidden_size;
+
+ CHECK_GPU_ERROR(cudaMalloc(&temp, weight_size * sizeof(_DataType)));
+ CHECK_GPU_ERROR(cudaMemcpyAsync(temp, _p_d_enc_wei[_weight_offset + 10],
+ weight_size * sizeof(_DataType),
+ cudaMemcpyHostToDevice, _stream));
+ launch_scaled_colsum(temp, _scaled_ffn2_colsum[_layer_id],
+ _tw._inner_size, _tw._hidden_size, relu_scale,
+ _stream);
+
+ CHECK_GPU_ERROR(cudaGetLastError());
+ CHECK_GPU_ERROR(cudaFree(temp));
+ }
+ }
+ std::cout << "encoder buffer init succeed" << std::endl;
+ return;
+}
+
+/**
+Some requirements needed by custom cuda kernel function
+*/
+template <OperationType OpType_>
+std::string QuantBertEncoder<OpType_>::check() {
+ // if (_max_thread_per_block < _tw._hidden_size) {
+ // return "violate hidden_size <= max_thread_per_block";
+ // }
+ if (_tw._inner_size & 1) {
+ return "violate inner_size % 2 = 0";
+ }
+ if (_tw._dim_per_head & 1) {
+ return "violate dim_per_head % 2 = 0";
+ }
+ if (_tw._multilg_type == 0 && _p_d_src_emb_wei.size() != 4) {
+ return "violate p_d_src_emb_wei.size() = 4";
+ }
+ if (_tw._multilg_type != 0 && _p_d_src_emb_wei.size() != 5) {
+ return "violate p_d_src_emb_wei.size() = 5";
+ }
+ if (_p_d_enc_wei.size() != _tw._weight_per_enc_layer * _tw._n_enc_layer) {
+ return "violate p_d_enc_wei.size() = weight_per_enc_layer * n_enc_layer";
+ }
+ if (_tw._multilg_type != 0 && _p_d_lang_id == nullptr) {
+ return "lang id should not be null when multilg";
+ }
+ return "";
+}
+
+/**
+Encoder inference
+*/
+template <OperationType OpType_>
+void QuantBertEncoder<OpType_>::run_one_infer(int batch_size,
+ int batch_seq_len) {
+ if (batch_size > _max_batch_size) {
+ throw std::runtime_error("batch size of input greater than max_batch_size");
+ }
+ if (batch_seq_len > _tw._max_step) {
+ throw std::runtime_error("seq len of input greater than max_step");
+ }
+ /* ---step1. init--- */
+ _batch_size = batch_size;
+ _batch_seq_len = batch_seq_len;
+ _batch_token_num = batch_size * batch_seq_len;
+#ifdef DEBUG_RESULT
+ std::cout << "batch_size-" << batch_size << " batch_seq_len-" << batch_seq_len
+ << std::endl;
+ print_vec(_p_d_token_id, "batch_token_ids", batch_size * batch_seq_len);
+#endif
+
+ /* ---step2. encoder feedforward--- */
+ launch_enc_emb_i8I<_DataType>(
+ _int8_p_d_src_emb_wei, _p_device_emb[1], _p_d_token_id, _p_d_output,
+ _p_d_padding_mask, _tw._padding_id, batch_size, batch_seq_len,
+ _tw._hidden_size, _stream, _p_device_emb[4], _p_d_lang_id,
+ _tw._multilg_type, _src_emb_clip_max / _quant_range, false);
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "emb out: token-" << j << std::endl;
+ print_vec(_p_d_output + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "emb out", 10);
+ }
+ } // not normal
+#endif
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ self_attention();
+ ffn_add_norm();
+ }
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "encoder output: token-" << j << std::endl;
+ print_vec(_p_d_output + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "encoder_output", _tw._dim_per_head);
+ }
+ } // not normal
+#endif
+ return;
+}
+
+/**
+Encoder self attention
+*/
+template <OperationType OpType_>
+void QuantBertEncoder<OpType_>::self_attention() {
+ /* ---step 0. layer_norm, add output_bias to "query"--- */
+ if (_layer_id == 0) {
+ ker_norm_layer_resual_i8O_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _p_d_output,
+ _int8_ffn_in_buf, _p_device_wei[_weight_offset],
+ _p_device_wei[_weight_offset + 1], _p_device_wei[_weight_offset + 5],
+ _max_thread_per_block, _quant_range / _enc_clip_max[_layer_id * 11 + 4],
+ _tw._is_post_ln, true);
+ }
+ CHECK_GPU_ERROR(cudaGetLastError());
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "qkv_attn input: token-" << j << std::endl;
+ print_vec(_int8_ffn_in_buf + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "qkv_attn input", 10);
+ }
+ }
+#endif
+
+ /* ---step 1. qkv = ori_q * qkv_wei + bias, and reshape qkv for multi-head
+ * gemm--- */
+ cublasLtMM_withAlgo_i8IO(
+ _int8_ffn_out_buf, 1, _batch_token_num, _tw._hidden_size * 3,
+ _tw._hidden_size, 0, 0, 0,
+ _enc_clip_max[_layer_id * 11] * _enc_clip_max[_layer_id * 11 + 4] /
+ (_enc_clip_max[_layer_id * 11 + 8] * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4], _cublas_lt_handle,
+ _stream, false);
+
+ // get q, k, v by split and reshape qkv
+ ker_arrange_encself_qkv_i8I_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _int8_ffn_out_buf,
+ _p_device_wei[_weight_offset + 3], _p_d_q, _max_batch_dim, _batch_seq_len,
+ _tw._dim_per_head, _tw._head_num, _max_thread_per_block,
+ _enc_clip_max[_layer_id * 11 + 8] / _quant_range, true);
+
+ /* ---step 2. correlation = q * k, perform softmax on correlation--- */
+ CHECK_GPU_ERROR(cublasGemmStridedBatchedEx(
+ _hd, CUBLAS_OP_T, CUBLAS_OP_N, _batch_seq_len, _batch_seq_len,
+ _tw._dim_per_head, &_atten_scaler, _p_d_k, _AType, _tw._dim_per_head,
+ _batch_seq_len * _tw._dim_per_head, _p_d_q, _BType, _tw._dim_per_head,
+ _batch_seq_len * _tw._dim_per_head, &_fzero, _p_d_c, _CType,
+ _batch_seq_len, _batch_seq_len * _batch_seq_len,
+ _batch_size * _tw._head_num, _computeType,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+ ker_correlation_softmax_encself_launcher<_DataType>(
+ _batch_size, _batch_seq_len, _tw._head_num, _stream, _p_d_c,
+ _p_d_padding_mask);
+
+ /* ---step 3. new_q = correlation * v--- */
+ CHECK_GPU_ERROR(cublasGemmStridedBatchedEx(
+ _hd, CUBLAS_OP_N, CUBLAS_OP_N, _tw._dim_per_head, _batch_seq_len,
+ _batch_seq_len, &_fone, _p_d_v, _AType, _tw._dim_per_head,
+ _batch_seq_len * _tw._dim_per_head, _p_d_c, _BType, _batch_seq_len,
+ _batch_seq_len * _batch_seq_len, &_fzero, _p_d_q, _CType,
+ _tw._dim_per_head, _batch_seq_len * _tw._dim_per_head,
+ _batch_size * _tw._head_num, _computeType,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+ // use v to save reshaped q, since they are the same size and v
+ // will not be used again before the next multi-head attention
+ ker_arrange_atten_output_i8O_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _p_d_q, _int8_ffn_in_buf,
+ _batch_seq_len, _tw._dim_per_head, _tw._head_num, _max_thread_per_block,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 5], true);
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "out_attn input: token-" << j << std::endl;
+ print_vec(_int8_ffn_in_buf + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "out_attn input", 10);
+ }
+ }
+#endif
+
+ /* ---step 4. new_q = ori_q + new_q * output_wei--- */
+ cublasLtMM_withAlgo_i8IO(
+ _int8_ffn_out_buf, 1, _batch_token_num, _tw._hidden_size,
+ _tw._hidden_size, 0, 0, 0,
+ _enc_clip_max[_layer_id * 11 + 1] * _enc_clip_max[_layer_id * 11 + 5] /
+ (_enc_clip_max[_layer_id * 11 + 9] * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4 + 1], _cublas_lt_handle,
+ _stream, false);
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "attn_ln input: token-" << j << std::endl;
+ print_vec(_int8_ffn_out_buf + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "attn_ln input", 10);
+ }
+ }
+#endif
+
+ ker_residual_bias_ln_i8I_i8O_launcher<_DataType>(
+ _int8_ffn_out_buf, _p_device_wei[_weight_offset + 6],
+ _p_device_wei[_weight_offset + 7], _p_device_wei[_weight_offset + 11],
+ _int8_ffn_in_buf, _p_d_output, _batch_token_num, _tw._hidden_size,
+ _enc_clip_max[_layer_id * 11 + 9] / _quant_range,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 6], _max_thread_per_block,
+ _stream, _tw._is_post_ln, true, true);
+
+ return;
+}
+
+template <OperationType OpType_>
+void QuantBertEncoder<OpType_>::ffn_add_norm() {
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "ffn1 input: token-" << j << std::endl;
+ print_vec(_int8_ffn_in_buf + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "ffn1 input", 10);
+ }
+ }
+#endif
+
+ /* ---step 1. first ffn layer--- */
+ cublasLtMM_withAlgo_i8IO(
+ _int8_ffn_out_buf, 1, _batch_token_num, _tw._inner_size, _tw._hidden_size,
+ 0, 0, 0,
+ _enc_clip_max[_layer_id * 11 + 2] * _enc_clip_max[_layer_id * 11 + 6] /
+ (_enc_clip_max[_layer_id * 11 + 10] * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4 + 2], _cublas_lt_handle,
+ _stream, false);
+
+ if (_tw._use_gelu) {
+ ker_bias_gelu_i8I_i8O_launcher<_DataType>(
+ _batch_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
+ _p_device_wei[_weight_offset + 9], _tw._inner_size,
+ _enc_clip_max[_layer_id * 11 + 10] / _quant_range,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 7], true, true);
+ } else {
+ ker_bias_relu_i8I_i8O_launcher<_DataType>(
+ _batch_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
+ _p_device_wei[_weight_offset + 9], _tw._inner_size,
+ _enc_clip_max[_layer_id * 11 + 10] / _quant_range,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 7],
+ _enc_clip_max[_layer_id * 11 + 7], true, true, true);
+ }
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "ffn2 input: token-" << j << std::endl;
+ print_vec(_int8_ffn_in_buf + i * _batch_seq_len * _tw._inner_size +
+ j * _tw._inner_size,
+ "ffn2 input", 10);
+ }
+ }
+#endif
+
+ /* ---step 2. second ffn layer--- */
+ cublasLtMM_withAlgo(_int32_ffn_out_buf, 1, _batch_token_num, _tw._hidden_size,
+ _tw._inner_size, 0, 0, 0, _int8_ffn_in_buf,
+ _int8_p_d_enc_wei[_layer_id * 4 + 3], _cublas_lt_handle,
+ _stream, false);
+
+ const _DataType *scale_ptr, *bias_ptr, *res_bias_ptr;
+ float clip_max, dequant_scale;
+ if (_tw._use_gelu) {
+ dequant_scale = _enc_clip_max[_layer_id * 11 + 3] *
+ _enc_clip_max[_layer_id * 11 + 7] /
+ (_quant_range * _quant_range);
+ } else {
+ dequant_scale = _enc_clip_max[_layer_id * 11 + 3] *
+ _enc_clip_max[_layer_id * 11 + 7] /
+ (2 * _quant_range * _quant_range);
+ }
+ if (_layer_id == _tw._n_enc_layer - 1) {
+ scale_ptr = _p_device_emb[2];
+ bias_ptr = _p_device_emb[3];
+
+ ker_residual_bias_ln_i32I_launcher<_DataType>(
+ _int32_ffn_out_buf, scale_ptr, bias_ptr, _p_d_output, _p_d_output,
+ _batch_token_num, _tw._hidden_size, dequant_scale,
+ _max_thread_per_block, _stream, true, _scaled_ffn2_colsum[_layer_id]);
+ } else {
+ scale_ptr = _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer];
+ bias_ptr = _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer + 1];
+ res_bias_ptr =
+ _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer + 5];
+ clip_max = _enc_clip_max[(_layer_id + 1) * 11 + 4];
+
+ ker_residual_bias_ln_i32I_i8O_launcher<_DataType>(
+ _int32_ffn_out_buf, scale_ptr, bias_ptr, res_bias_ptr, _int8_ffn_in_buf,
+ _p_d_output, _batch_token_num, _tw._hidden_size, dequant_scale,
+ _quant_range / clip_max, _max_thread_per_block, _stream,
+ _tw._is_post_ln, true, true, _scaled_ffn2_colsum[_layer_id]);
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "encoder layer out: token-" << j << std::endl;
+ print_vec(_int8_ffn_in_buf + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "encoder layer out", 10);
+ }
+ }
+#endif
+ }
+
+ return;
+}
+
+template class QuantBertEncoder<OperationType::FP16>;
+template class QuantBertEncoder<OperationType::FP32>;
+
+} // namespace cuda
+} // namespace lightseq
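
The `alpha` passed to the `cublasLtMM_withAlgo_i8IO` calls above, e.g. `clip_in * clip_wei / (clip_out * _quant_range)`, folds dequantization of both int8 operands and requantization of the int32 accumulator into a single scale. A rough numeric check of that identity (an illustrative sketch under the symmetric-quantization assumption, not LightSeq code):

```cpp
#include <cstdint>
#include <cstdio>

// With q = round(x * 127 / clip), the int32 dot product of two int8 vectors
// recovers the real dot product up to clip_a * clip_b / (127 * 127), and
// dividing once more by clip_out / 127 re-quantizes the result into the
// output tensor's int8 range -- hence alpha = clip_a * clip_b / (clip_out * 127).
int main() {
  const float kQ = 127.f;
  float a[2] = {0.8f, -1.5f}, b[2] = {2.0f, 0.4f};
  float clip_a = 2.f, clip_b = 4.f, clip_out = 4.f;

  // quantize inputs (round half away from zero)
  int8_t qa[2], qb[2];
  for (int i = 0; i < 2; ++i) {
    qa[i] = static_cast<int8_t>(a[i] * kQ / clip_a + (a[i] >= 0 ? 0.5f : -0.5f));
    qb[i] = static_cast<int8_t>(b[i] * kQ / clip_b + (b[i] >= 0 ? 0.5f : -0.5f));
  }

  // int32 accumulation, as done by the tensor-core GEMM
  int32_t acc = 0;
  for (int i = 0; i < 2; ++i) acc += int32_t(qa[i]) * int32_t(qb[i]);

  // dequantize-and-requantize in one step
  float alpha = clip_a * clip_b / (clip_out * kQ);
  float q_out = acc * alpha;                // int8-scaled output value
  float recovered = q_out * clip_out / kQ;  // back to real units

  float exact = a[0] * b[0] + a[1] * b[1];
  printf("exact=%.4f  recovered=%.4f\n", exact, recovered);
}
```
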
diff --git a/lightseq/inference/model/quant_bert_encoder.h b/lightseq/inference/model/quant_bert_encoder.h
new file mode 100644
index 00000000..e8432a9d
--- /dev/null
+++ b/lightseq/inference/model/quant_bert_encoder.h
@@ -0,0 +1,109 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "../proto/quant_bert_weight.h"
+#include "../tools/util.h"
+
+/**
+@file
+QuantBert encoder, composed by gemm lib and
+ custom cuda kernel function
+*/
+
+namespace lightseq {
+namespace cuda {
+
+template <OperationType OpType_>
+class QuantBertEncoder {
+ private:
+ typedef OperationTypeTraits<OpType_> _optraits;
+ typedef typename _optraits::DataType _DataType;
+ const cudaDataType_t _computeType = _optraits::computeType;
+ const cudaDataType_t _AType = _optraits::AType;
+ const cudaDataType_t _BType = _optraits::BType;
+ const cudaDataType_t _CType = _optraits::CType;
+
+ // private member function
+ void self_attention();
+ void ffn_add_norm();
+
+ const int _max_batch_size;
+ int *_p_d_padding_mask; // true sequence length(remove padding), [batch_size]
+
+ const int *_p_d_lang_id;
+ const QuantBertWeight<OpType_> &_tw;
+ cudaStream_t _stream;
+ cublasHandle_t _hd;
+ cublasLtHandle_t _cublas_lt_handle;
+ const _DataType _fone;
+ const _DataType _fzero;
+ const int32_t _ione;
+ const int32_t _izero;
+ const _DataType _atten_scaler;
+ const int _max_batch_dim;
+ const int _max_thread_per_block;
+
+ _DataType *_p_d_qkv_projected;
+ _DataType *_p_d_q;
+ _DataType *_p_d_k;
+ _DataType *_p_d_v;
+ _DataType *_p_d_c;
+ _DataType *_p_d_ffn_buf1;
+ _DataType *_p_d_ffn_buf2;
+
+ int8_t *_int8_ffn_in_buf;
+ int32_t *_int32_ffn_out_buf;
+ int8_t *_int8_ffn_out_buf;
+
+ // {token_emb, pos_emb, norm_scale, norm_bias}
+ const std::vector<const _DataType *> &_p_d_src_emb_wei;
+ // {multihead_norm_scale, multihead_norm_bias, multihead_qkv_kernel,
+ // multihead_qkv_bias multihead_output_kernel, multihead_output_bias
+ // ffn_norm_scale, ffn_norm_bias}
+ // ffn_first_kernel, ffn_first_bias, ffn_second_kernel, ffn_second_bias} *
+ // encoder_layer_num
+ const std::vector<const _DataType *> &_p_d_enc_wei;
+ std::vector<const _DataType *> _p_device_wei;
+ std::vector<const _DataType *> _p_device_emb;
+
+ std::vector<int8_t *> _int8_p_d_enc_wei;
+ int8_t *_int8_p_d_src_emb_wei;
+ const float _quant_range = 127;
+ const float _src_emb_clip_max;
+ const std::vector<float> _enc_clip_max; // size: 11 * enc_layer_num
+ std::vector<_DataType *> _scaled_ffn2_colsum;
+
+ int _batch_size;
+ int _batch_seq_len;
+ int _batch_token_num;
+ int _layer_id;
+ int _weight_offset;
+
+ public:
+ const int *_p_d_token_id; // input token id [batch_size, batch_seq_len]
+ _DataType
+ *_p_d_output; // encoder output, [batch_size, batch_seq_len, hidden_size]
+
+ QuantBertEncoder(int max_batch_size, const int *p_d_token_id,
+ int *p_d_padding_mask, _DataType *p_d_output,
+ const QuantBertWeight<OpType_> &tw, cudaStream_t stream,
+ cublasHandle_t hd, const int *p_d_lang_id = nullptr);
+ void init_buffer();
+ std::string check();
+ void run_one_infer(int batch_size, int batch_seq_len);
+};
+
+} // namespace cuda
+} // namespace lightseq
diff --git a/lightseq/inference/model/quant_decoder.cc.cu b/lightseq/inference/model/quant_decoder.cc.cu
index 1672d34f..9bc833ad 100644
--- a/lightseq/inference/model/quant_decoder.cc.cu
+++ b/lightseq/inference/model/quant_decoder.cc.cu
@@ -7,7 +7,7 @@
/**
@file
-Transformer decoder, composed by gemm lib and
+QuantTransformer decoder, composed by gemm lib and
custom cuda kernel function
*/
@@ -70,15 +70,6 @@ QuantDecoder::QuantDecoder(int max_batch_size,
return;
}
-/**
-Compute GPU memory size needed by transformer decoder,
- to see how these memory is used, checkout init_buffer() for detail
-*/
-template
-long QuantDecoder::compute_buffer_bytesize() {
- return 0;
-}
-
/**
Init the GPU memory pointer which point to
the memory buffer needed by decoder.
@@ -573,7 +564,7 @@ void QuantDecoder::embedding() {
_p_device_emb[7], _p_d_lang_id, _p_d_cur_step_query, _batch_size,
_tw._beam_size, _tw._hidden_size, _tw._trg_vocab_size, _cur_step,
_tw._max_step, _tw._multilg_type, _stream,
- _trg_emb_clip_max / _quant_range);
+ _trg_emb_clip_max / _quant_range, true);
#ifdef DEBUG_RESULT
for (int i = 0; i < _batch_size; i++) { // batch_id
for (int j = 0; j < _tw._beam_size; j++) { // beam_id
@@ -647,7 +638,7 @@ void QuantDecoder::self_attention() {
// get q, k, v by split and reshape qkv
- ker_arrange_decself_qkv_i8I_launcher<_DataType>(
+ ker_arrange_decself_qkv_i8I_i8O_launcher<_DataType>(
_step_token_num, _tw._hidden_size, _stream, _int8_ffn_out_buf,
_p_device_wei[_weight_offset + 3], _int8_ffn_in_buf,
_p_d_self_k_cache1[_layer_id], _p_d_self_v_cache1[_layer_id],
@@ -680,7 +671,7 @@ void QuantDecoder::self_attention() {
CHECK_GPU_ERROR(cudaGetLastError());
#endif
- ker_fuse_softmax_new_value_int8_launcher(
+ ker_fuse_softmax_new_value_i32I_i8O_launcher(
_int32_ffn_out_buf, _p_d_self_v_cache1[_layer_id], _int8_ffn_in_buf,
_step_token_num * _tw._head_num, _cur_step + 1, _tw._max_step,
_tw._head_num, _tw._dim_per_head, float(_atten_scaler),
@@ -849,7 +840,7 @@ void QuantDecoder::ffn_add_norm() {
_step_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
_p_device_wei[_weight_offset + 15], _tw._inner_size,
_dec_clip_max[_layer_id * 19 + 16] / _quant_range,
- _quant_range / _dec_clip_max[_layer_id * 19 + 11], true);
+ _quant_range / _dec_clip_max[_layer_id * 19 + 11], true, false);
} else {
ker_bias_relu_i8I_i8O_launcher<_DataType>(
_step_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
@@ -871,7 +862,16 @@ void QuantDecoder::ffn_add_norm() {
_tw._inner_size, 0, 0, 0, 1, _cublas_lt_handle, _stream);
const _DataType *scale_ptr, *bias_ptr, *res_bias_ptr;
- float clip_max;
+ float clip_max, dequant_scale;
+ if (_tw._use_gelu) {
+ dequant_scale = _dec_clip_max[_layer_id * 19 + 5] *
+ _dec_clip_max[_layer_id * 19 + 11] /
+ (_quant_range * _quant_range);
+ } else {
+ dequant_scale = _dec_clip_max[_layer_id * 19 + 5] *
+ _dec_clip_max[_layer_id * 19 + 11] /
+ (2 * _quant_range * _quant_range);
+ }
if (_layer_id == _tw._n_dec_layer - 1) {
scale_ptr = _p_device_emb[2];
bias_ptr = _p_device_emb[3];
@@ -887,9 +887,7 @@ void QuantDecoder::ffn_add_norm() {
ker_residual_bias_ln_i32I_i8O_launcher<_DataType>(
_int32_ffn_out_buf, scale_ptr, bias_ptr, res_bias_ptr, _int8_ffn_in_buf,
- _p_d_cur_step_query, _step_token_num, _tw._hidden_size,
- _dec_clip_max[_layer_id * 19 + 5] * _dec_clip_max[_layer_id * 19 + 11] /
- (2 * _quant_range * _quant_range),
+ _p_d_cur_step_query, _step_token_num, _tw._hidden_size, dequant_scale,
_quant_range / clip_max, _max_thread_per_block, _stream, _tw._is_post_ln,
false, true, _scaled_ffn2_colsum[_layer_id]);
@@ -906,22 +904,23 @@ void QuantDecoder::ffn_add_norm() {
template <OperationType OpType_>
bool QuantDecoder<OpType_>::sample() {
- throw std::runtime_error("QuantDecoder sample() not implemented");
CHECK_GPU_ERROR(
cudaMemsetAsync(_p_d_sample_unfinished, 0, sizeof(int), _stream));
/* --- Sample new tokens from logits --- */
if (_tw._sampling_method == "topk") {
- ker_topk_sample_launcher<_DataType>(
+ ker_topk_sample_i8I_launcher<_DataType>(
_batch_size, (_cur_step + 1), _tw._max_step, 1, _max_thread_per_block,
- _stream, _p_d_logit_buf, _p_device_emb[6], _p_d_alive_seq,
+ _stream, _int8_ffn_out_buf, _p_device_emb[6], _p_d_alive_seq,
_p_d_alive_seq_buf, _tw._trg_vocab_size, _tw._topk,
- _p_d_sample_unfinished, _p_d_curandstate, _tw._end_id);
+ _p_d_sample_unfinished, _p_d_curandstate, _tw._end_id,
+ _logits_clip_max / _quant_range, true);
} else {
- ker_topp_sample_launcher<_DataType>(
+ ker_topp_sample_i8I_launcher<_DataType>(
_batch_size, (_cur_step + 1), _tw._max_step, 1, _max_thread_per_block,
- _stream, _p_d_logit_buf, _p_device_emb[6], _p_d_alive_seq,
+ _stream, _int8_ffn_out_buf, _p_device_emb[6], _p_d_alive_seq,
_p_d_alive_seq_buf, _tw._trg_vocab_size, _tw._topp,
- _p_d_sample_unfinished, _p_d_curandstate, _tw._end_id);
+ _p_d_sample_unfinished, _p_d_curandstate, _tw._end_id,
+ _logits_clip_max / _quant_range, true);
}
#ifdef DEBUG_RESULT
print_vec(_p_d_sample_unfinished, "unfinished flag", 1);
@@ -1054,7 +1053,6 @@ void QuantDecoder::update_new_seq_probs() {
template <OperationType OpType_>
bool QuantDecoder<OpType_>::topk_greedy_search() {
- throw std::runtime_error("QuantDecoder topk_greedy_search() not implemented");
_tw._diverse_lambda = 0;
if (_cur_step == 0) {
return beam_search();
@@ -1063,11 +1061,11 @@ bool QuantDecoder::topk_greedy_search() {
CHECK_GPU_ERROR(
cudaMemsetAsync(_p_d_sample_unfinished, 0, sizeof(int), _stream));
/* --- Sample new tokens from logits --- */
- ker_topk_sample_launcher<_DataType>(
+ ker_topk_sample_i8I_launcher<_DataType>(
_step_token_num, (_cur_step + 1), _tw._max_step, 1, _max_thread_per_block,
- _stream, _p_d_logit_buf, _p_device_emb[6], _p_d_alive_seq,
+ _stream, _int8_ffn_out_buf, _p_device_emb[6], _p_d_alive_seq,
_p_d_alive_seq_buf, _tw._trg_vocab_size, 1, _p_d_sample_unfinished,
- _p_d_curandstate, _tw._end_id);
+ _p_d_curandstate, _tw._end_id, _logits_clip_max / _quant_range, true);
#ifdef DEBUG_RESULT
print_vec(_p_d_sample_unfinished, "unfinished flag", 1);
diff --git a/lightseq/inference/model/quant_decoder.h b/lightseq/inference/model/quant_decoder.h
index 63682766..9274e0fb 100644
--- a/lightseq/inference/model/quant_decoder.h
+++ b/lightseq/inference/model/quant_decoder.h
@@ -20,7 +20,7 @@
/**
@file
-Transformer decoder, composed by gemm lib and
+QuantTransformer decoder, composed by gemm lib and
custom cuda kernel function
*/
namespace lightseq {
@@ -101,7 +101,6 @@ class QuantDecoder {
_DataType* _p_d_query_buf2;
_DataType* _p_d_c;
_DataType* _p_d_encoder_out_buf;
- _DataType* _p_d_logit_buf;
int8_t* _int8_ffn_in_buf;
int32_t* _int32_ffn_out_buf;
@@ -159,7 +158,6 @@ class QuantDecoder {
QuantTransformerWeight& tw, cudaStream_t stream,
cublasHandle_t hd, bool output_topk = false,
const int* p_d_lang_id = nullptr);
- long compute_buffer_bytesize();
void init_buffer();
std::string check();
void run_one_infer(int batch_size, int batch_seq_len);
diff --git a/lightseq/inference/model/quant_encoder.cc.cu b/lightseq/inference/model/quant_encoder.cc.cu
index 3f9d2b9d..075bccf9 100644
--- a/lightseq/inference/model/quant_encoder.cc.cu
+++ b/lightseq/inference/model/quant_encoder.cc.cu
@@ -7,7 +7,7 @@
/**
@file
-Transformer encoder, composed by gemm lib and
+QuantTransformer encoder, composed by gemm lib and
custom cuda kernel function
*/
@@ -45,19 +45,6 @@ QuantEncoder::QuantEncoder(int max_batch_size, int *p_d_token_id,
CHECK_GPU_ERROR(cublasLtCreate(&_cublas_lt_handle));
}
-/**
-Compute GPU memory size needed by transformer encoder,
- to see how these memory is used, checkout init_buffer() for detail
-*/
-template
-long QuantEncoder::compute_buffer_bytesize() {
- // long sz1 = _max_batch_dim * 6 +
- // _max_batch_size * _tw._head_num * _tw._max_step * _tw._max_step;
- // long sz2 = _max_batch_dim + _max_batch_size * _tw._max_step *
- // _tw._inner_size; return max(sz1, sz2) * sizeof(_DataType);
- return 0;
-}
-
/**
Init the GPU memory pointer which point to
the memory buffer needed by encoder.
@@ -89,9 +76,10 @@ void QuantEncoder::init_buffer() {
CHECK_GPU_ERROR(
cudaMalloc(&_int8_p_d_src_emb_wei,
_tw._src_vocab_size * _tw._hidden_size * sizeof(int8_t)));
- quantize_weight(_p_d_src_emb_wei[0], _int8_p_d_src_emb_wei, _tw._hidden_size,
- _tw._src_vocab_size, _quant_range / _src_emb_clip_max,
- _stream, _cublas_lt_handle, kRowMajor);
+ quantize_weight(_p_d_src_emb_wei[0], _int8_p_d_src_emb_wei,
+ _tw._src_vocab_size, _tw._hidden_size,
+ _quant_range / _src_emb_clip_max, _stream, _cublas_lt_handle,
+ kRowMajor);
_p_device_emb.push_back(nullptr);
_p_device_emb.push_back(
@@ -247,7 +235,7 @@ void QuantEncoder::run_one_infer(int batch_size, int batch_seq_len) {
_int8_p_d_src_emb_wei, _p_device_emb[1], _p_d_token_id, _p_d_output,
_p_d_padding_mask, _tw._padding_id, batch_size, batch_seq_len,
_tw._hidden_size, _stream, _p_device_emb[4], _p_d_lang_id,
- _tw._multilg_type, _src_emb_clip_max / _quant_range);
+ _tw._multilg_type, _src_emb_clip_max / _quant_range, true);
#ifdef DEBUG_RESULT
for (int i = 0; i < _batch_size; i++) { // batch_id
for (int j = 0; j < _batch_seq_len; j++) { // token_id
@@ -356,7 +344,7 @@ void QuantEncoder::self_attention() {
_int8_ffn_in_buf, _p_d_output, _batch_token_num, _tw._hidden_size,
_enc_clip_max[_layer_id * 12 + 9] / _quant_range,
_quant_range / _enc_clip_max[_layer_id * 12 + 6], _max_thread_per_block,
- _stream, _tw._is_post_ln, true);
+ _stream, _tw._is_post_ln, true, true);
return;
}
@@ -376,7 +364,7 @@ void QuantEncoder::ffn_add_norm() {
_batch_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
_p_device_wei[_weight_offset + 9], _tw._inner_size,
_enc_clip_max[_layer_id * 12 + 10] / _quant_range,
- _quant_range / _enc_clip_max[_layer_id * 12 + 7], true);
+ _quant_range / _enc_clip_max[_layer_id * 12 + 7], true, true);
} else {
ker_bias_relu_i8I_i8O_launcher<_DataType>(
_batch_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
@@ -393,16 +381,23 @@ void QuantEncoder::ffn_add_norm() {
_stream, false);
const _DataType *scale_ptr, *bias_ptr, *res_bias_ptr;
- float clip_max;
+ float clip_max, dequant_scale;
+ if (_tw._use_gelu) {
+ dequant_scale = _enc_clip_max[_layer_id * 12 + 3] *
+ _enc_clip_max[_layer_id * 12 + 7] /
+ (_quant_range * _quant_range);
+ } else {
+ dequant_scale = _enc_clip_max[_layer_id * 12 + 3] *
+ _enc_clip_max[_layer_id * 12 + 7] /
+ (2 * _quant_range * _quant_range);
+ }
if (_layer_id == _tw._n_enc_layer - 1) {
scale_ptr = _p_device_emb[2];
bias_ptr = _p_device_emb[3];
ker_residual_bias_ln_i32I_launcher<_DataType>(
_int32_ffn_out_buf, scale_ptr, bias_ptr, _p_d_output, _p_d_output,
- _batch_token_num, _tw._hidden_size,
- _enc_clip_max[_layer_id * 12 + 3] * _enc_clip_max[_layer_id * 12 + 7] /
- (2 * _quant_range * _quant_range),
+ _batch_token_num, _tw._hidden_size, dequant_scale,
_max_thread_per_block, _stream, true, _scaled_ffn2_colsum[_layer_id]);
} else {
scale_ptr = _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer];
@@ -413,9 +408,7 @@ void QuantEncoder::ffn_add_norm() {
ker_residual_bias_ln_i32I_i8O_launcher<_DataType>(
_int32_ffn_out_buf, scale_ptr, bias_ptr, res_bias_ptr, _int8_ffn_in_buf,
- _p_d_output, _batch_token_num, _tw._hidden_size,
- _enc_clip_max[_layer_id * 12 + 3] * _enc_clip_max[_layer_id * 12 + 7] /
- (2 * _quant_range * _quant_range),
+ _p_d_output, _batch_token_num, _tw._hidden_size, dequant_scale,
_quant_range / clip_max, _max_thread_per_block, _stream,
_tw._is_post_ln, true, true, _scaled_ffn2_colsum[_layer_id]);
}
diff --git a/lightseq/inference/model/quant_encoder.h b/lightseq/inference/model/quant_encoder.h
index d14f3fd0..0d77114b 100644
--- a/lightseq/inference/model/quant_encoder.h
+++ b/lightseq/inference/model/quant_encoder.h
@@ -18,7 +18,7 @@
/**
@file
-Transformer decoder, composed by gemm lib and
+QuantTransformer encoder, composed by gemm lib and
custom cuda kernel function
*/
@@ -99,7 +99,6 @@ class QuantEncoder {
_DataType *p_d_output, const QuantTransformerWeight &tw,
cudaStream_t stream, cublasHandle_t hd,
const int *p_d_lang_id = nullptr);
- long compute_buffer_bytesize();
void init_buffer();
std::string check();
void run_one_infer(int batch_size, int batch_seq_len);
diff --git a/lightseq/inference/model/quant_gpt_encoder.cc.cu b/lightseq/inference/model/quant_gpt_encoder.cc.cu
new file mode 100644
index 00000000..26f1b5e8
--- /dev/null
+++ b/lightseq/inference/model/quant_gpt_encoder.cc.cu
@@ -0,0 +1,769 @@
+#include "../kernels/gptKernels.h"
+#include "../kernels/gptKernels_int8.h"
+#include "../kernels/transformerKernels.h"
+#include "../kernels/transformerKernels_int8.h"
+#include "quant_gpt_encoder.h"
+#include "cublas_helper.h"
+
+/**
+@file
+QuantGPT encoder, composed by gemm lib and
+ custom cuda kernel function
+*/
+
+// #define DEBUG_RESULT
+
+namespace lightseq {
+namespace cuda {
+
+template <OperationType OpType_>
+QuantGptEncoder<OpType_>::QuantGptEncoder(
+ int max_batch_size, const int *p_d_token_id, float *p_d_ppl,
+ int *p_d_sample_id, const QuantGptWeight<OpType_> &tw, cudaStream_t stream,
+ cudaStream_t cache_stream, cublasHandle_t hd)
+ : _max_batch_size(max_batch_size),
+ _p_d_token_id(p_d_token_id),
+ _p_d_ppl(p_d_ppl),
+ _p_d_sample_id(p_d_sample_id),
+ _tw(tw),
+ _stream(stream),
+ _cache_stream(cache_stream),
+ _hd(hd),
+ _p_d_src_emb_wei(tw.get_src_emb_wei()),
+ _p_d_enc_wei(tw.get_enc_wei()),
+ _fone((_DataType)1.f),
+ _fzero((_DataType)0.f),
+ _src_emb_clip_max(tw.get_src_emb_clip_max()),
+ _output_ln_clip_max(tw.get_output_ln_clip_max()),
+ _logits_clip_max(tw.get_logits_clip_max()),
+ _enc_clip_max(tw.get_enc_clip_max()),
+ _ione((int32_t)1),
+ _izero((int32_t)0),
+ _atten_scaler((_DataType)sqrt(1.f / tw._dim_per_head)),
+ _max_batch_dim(max_batch_size * tw._max_step * tw._hidden_size),
+ _max_thread_per_block(1024),
+ _h_real_seq_len(max_batch_size, 0),
+ _h_ppl(max_batch_size, 0.f),
+ _h_sample_id(max_batch_size * tw._max_step, 0),
+ _h_unfinished(1) {
+ CHECK_GPU_ERROR(cublasLtCreate(&_cublas_lt_handle));
+}
+
+/**
+Init the GPU memory pointers which point to
+ the memory buffers needed by the encoder.
+These buffers are used by the custom cuda kernel functions;
+ find the corresponding function to see how each buffer is used
+*/
+template <OperationType OpType_>
+void QuantGptEncoder<OpType_>::init_buffer() {
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_p_d_real_seq_len, _max_batch_size * sizeof(int)));
+ CHECK_GPU_ERROR(cudaMalloc(&_p_d_query, _max_batch_dim * sizeof(_DataType)));
+ CHECK_GPU_ERROR(cudaMalloc((void **)&_p_d_curandstate,
+ _max_batch_size * sizeof(curandState)));
+ CHECK_GPU_ERROR(cudaMalloc((void **)&_p_d_sample_id_buf,
+ _max_batch_size * _tw._max_step * sizeof(int)));
+ CHECK_GPU_ERROR(cudaMalloc((void **)&_p_d_unfinished, sizeof(int)));
+ ker_curand_setup<<<_max_batch_size, 1, 0, _stream>>>(_p_d_curandstate);
+
+ _DataType *qkv_buf;
+ CHECK_GPU_ERROR(cudaMalloc(&qkv_buf, 3 * _max_batch_dim * sizeof(_DataType)));
+ _p_d_q = qkv_buf;
+ _p_d_k = qkv_buf + _max_batch_dim;
+ _p_d_v = qkv_buf + 2 * _max_batch_dim;
+
+ int max_attn_score_dim = round_up(
+ _max_batch_size * _tw._head_num * _tw._max_step * _tw._max_step, 32);
+
+ CHECK_GPU_ERROR(cudaMalloc(&_p_d_c, max_attn_score_dim * sizeof(_DataType)));
+
+ int max_batch_dim =
+ _max_batch_size * _tw._max_step *
+ round_up(std::max(_tw._inner_size, _tw._hidden_size * 3), 32);
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_ffn_in_buf, max_batch_dim * sizeof(int8_t)));
+ CHECK_GPU_ERROR(cudaMalloc(
+ &_int32_ffn_out_buf,
+ std::max(max_batch_dim, max_attn_score_dim) * sizeof(int32_t)));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_ffn_out_buf,
+ std::max(max_batch_dim, round_up(_tw._src_vocab_size, 32) *
+ _tw._max_step * _max_batch_size) *
+ sizeof(int8_t)));
+
+ // malloc embeddings
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_src_emb_wei,
+ _tw._src_vocab_size * _tw._hidden_size * sizeof(int8_t)));
+ quantize_weight(_p_d_src_emb_wei[0], _int8_p_d_src_emb_wei, _tw._hidden_size,
+ _tw._src_vocab_size, _quant_range / _src_emb_clip_max,
+ _stream, _cublas_lt_handle);
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_src_emb_bottom_wei,
+ _tw._src_vocab_size * _tw._hidden_size * sizeof(int8_t)));
+ quantize_weight(_p_d_src_emb_wei[0], _int8_p_d_src_emb_bottom_wei,
+ _tw._hidden_size, _tw._src_vocab_size,
+ _quant_range / _src_emb_clip_max, _stream, _cublas_lt_handle,
+ kColMajor);
+ _p_device_emb.push_back(nullptr);
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[1], _tw._max_step * _tw._hidden_size, _stream));
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[2], _tw._hidden_size, _stream));
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[3], _tw._hidden_size, _stream));
+
+ // malloc reused kv cache max size: _tw._hidden_size * 2 * _tw._n_enc_layer *
+ // _max_batch_size * _max_step * sizeof(T)
+ int8_t *self_kv_cache_buffer;
+ int8_t *sliding_p;
+ CHECK_GPU_ERROR(
+ cudaMalloc(&self_kv_cache_buffer,
+ _max_batch_dim * _tw._n_enc_layer * 4 * sizeof(int8_t)));
+
+ sliding_p = self_kv_cache_buffer;
+ for (int i = 0; i < _tw._n_enc_layer * 2; i++) {
+ _p_d_self_k_cache.push_back(sliding_p);
+ sliding_p += _max_batch_dim;
+ }
+ for (int i = 0; i < _tw._n_enc_layer * 2; i++) {
+ _p_d_self_v_cache.push_back(sliding_p);
+ sliding_p += _max_batch_dim;
+ }
+ _p_d_self_k_cache1 = _p_d_self_k_cache.data();
+ _p_d_self_k_cache2 = _p_d_self_k_cache.data() + _tw._n_enc_layer;
+ _p_d_self_v_cache1 = _p_d_self_v_cache.data();
+ _p_d_self_v_cache2 = _p_d_self_v_cache.data() + _tw._n_enc_layer;
+
+ // malloc weights
+ _int8_p_d_enc_wei = std::vector<int8_t *>(_tw._n_enc_layer * 4);
+ _scaled_ffn2_colsum = std::vector<_DataType *>(_tw._n_enc_layer);
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ // malloc quantized weights
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4],
+ _tw._hidden_size * 3 * _tw._hidden_size * sizeof(int8_t)));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 1],
+ _tw._hidden_size * _tw._hidden_size * sizeof(int8_t)));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 2],
+ _tw._hidden_size * _tw._inner_size * sizeof(int8_t)));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 3],
+ _tw._inner_size * _tw._hidden_size * sizeof(int8_t)));
+
+ // malloc unquantized weights
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 1], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(to_gpu(_p_d_enc_wei[_weight_offset + 3],
+ _tw._hidden_size * 3, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 5], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 6], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 7], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 9], _tw._inner_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 11], _tw._hidden_size, _stream));
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 2],
+ _int8_p_d_enc_wei[_layer_id * 4], _tw._hidden_size,
+ _tw._hidden_size * 3,
+ _quant_range / _enc_clip_max[_layer_id * 12], _stream,
+ _cublas_lt_handle);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 4],
+ _int8_p_d_enc_wei[_layer_id * 4 + 1], _tw._hidden_size,
+ _tw._hidden_size,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 1], _stream,
+ _cublas_lt_handle, kColMajor);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 8],
+ _int8_p_d_enc_wei[_layer_id * 4 + 2], _tw._hidden_size,
+ _tw._inner_size,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 2], _stream,
+ _cublas_lt_handle);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 10],
+ _int8_p_d_enc_wei[_layer_id * 4 + 3], _tw._inner_size,
+ _tw._hidden_size,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 3], _stream,
+ _cublas_lt_handle, kColMajor);
+
+ _scaled_ffn2_colsum[_layer_id] = nullptr;
+ }
+
+ CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
+ CHECK_GPU_ERROR(cudaGetLastError());
+ std::cout << "quantized encoder buffer init succeed" << std::endl;
+
+ return;
+}
+
+/**
+Some requirements needed by custom cuda kernel function
+*/
+template <OperationType OpType_>
+std::string QuantGptEncoder<OpType_>::check() {
+ // if (_max_thread_per_block < _tw._hidden_size) {
+ // return "violate hidden_size <= max_thread_per_block";
+ // }
+ if (_tw._inner_size & 1) {
+ return "violate inner_size % 2 = 0";
+ }
+ if (_tw._dim_per_head & 1) {
+ return "violate dim_per_head % 2 = 0";
+ }
+ if (_p_d_src_emb_wei.size() != 4) {
+ return "violate p_d_src_emb_wei.size() = 4";
+ }
+ if (_p_d_enc_wei.size() != _tw._weight_per_enc_layer * _tw._n_enc_layer) {
+ return "violate p_d_enc_wei.size() = weight_per_enc_layer * n_enc_layer";
+ }
+ std::string sampling_method = _tw._sampling_method;
+ if (kSamplingMethods.find(sampling_method) == kSamplingMethods.end()) {
+ return std::string("unsupported sampling_method: ") + sampling_method;
+ }
+
+ if (_tw._topk <= 0) {
+ return "topk must be positive";
+ }
+ if (_tw._topp <= 0 || _tw._topp >= 1.0) {
+ return "topp must be in (0, 1)";
+ }
+
+ return "";
+}
+
+template <OperationType OpType_>
+void QuantGptEncoder<OpType_>::run_one_infer(int batch_size,
+ int batch_seq_len) {
+ if (batch_size > _max_batch_size) {
+ throw std::runtime_error("batch size of input greater than max_batch_size");
+ }
+ if (batch_seq_len > _tw._max_step) {
+ throw std::runtime_error("seq len of input greater than max_step");
+ }
+ _batch_size = batch_size;
+ _batch_seq_len = batch_seq_len;
+ _batch_token_num = batch_size * batch_seq_len;
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_real_seq_len, _h_real_seq_len.data(),
+ sizeof(int) * _batch_size,
+ cudaMemcpyHostToDevice, _stream));
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_ppl, _h_ppl.data(),
+ sizeof(float) * _batch_size,
+ cudaMemcpyHostToDevice, _stream));
+
+#ifdef DEBUG_RESULT
+ std::cout << "batch_size-" << batch_size << " batch_seq_len-" << batch_seq_len
+ << std::endl;
+ print_vec(_p_d_token_id, "batch_token_ids", batch_size * batch_seq_len);
+#endif
+
+ // token embedding, add position embedding and layer_norm
+ ker_gpt_embedding_i8I_launcher<_DataType>(
+ batch_size, batch_seq_len, _tw._hidden_size, _stream,
+ _int8_p_d_src_emb_bottom_wei, _p_device_emb[1], _p_d_token_id, _p_d_query,
+ _p_d_real_seq_len, _tw._padding_id, 0, _src_emb_clip_max / _quant_range);
+
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ self_attention();
+ ffn_add_norm();
+ }
+
+ compute_ppl();
+ return;
+}
+
+template <OperationType OpType_>
+int QuantGptEncoder<OpType_>::run_one_sample(int batch_size,
+ int batch_seq_len) {
+ if (batch_size > _max_batch_size) {
+ throw std::runtime_error("batch size of input greater than max_batch_size");
+ }
+ if (batch_seq_len > _tw._max_step) {
+ throw std::runtime_error("seq len of input greater than max_step");
+ }
+ _batch_size = batch_size;
+ _batch_seq_len = batch_seq_len;
+ _batch_token_num = batch_size * batch_seq_len;
+
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_real_seq_len, _h_real_seq_len.data(),
+ sizeof(int) * _batch_size,
+ cudaMemcpyHostToDevice, _stream));
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_ppl, _h_ppl.data(),
+ sizeof(float) * _batch_size,
+ cudaMemcpyHostToDevice, _stream));
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_sample_id, _p_d_token_id,
+ sizeof(int) * _batch_size * _batch_seq_len,
+ cudaMemcpyDeviceToDevice, _stream));
+#ifdef DEBUG_RESULT
+ std::cout << "batch_size-" << batch_size << " batch_seq_len-" << batch_seq_len
+ << std::endl;
+ std::cout << "Sample with " << _tw._sampling_method << std::endl;
+ std::cout << "padding_id: " << _tw._padding_id << std::endl;
+ std::cout << "vocab_size: " << _tw._src_vocab_size << std::endl;
+ print_vec(_p_d_sample_id, "batch_token_ids", batch_size * batch_seq_len);
+#endif
+
+ // token embedding, add position embedding and layer_norm
+ ker_gpt_embedding_i8I_launcher<_DataType>(
+ _batch_size, _batch_seq_len, _tw._hidden_size, _stream,
+ _int8_p_d_src_emb_bottom_wei, _p_device_emb[1], _p_d_sample_id,
+ _p_d_query, _p_d_real_seq_len, _tw._padding_id, 0,
+ _src_emb_clip_max / _quant_range);
+
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ self_attention();
+ ffn_add_norm();
+ }
+
+ int8_t **ftmp = _p_d_self_k_cache2;
+ _p_d_self_k_cache2 = _p_d_self_k_cache1;
+ _p_d_self_k_cache1 = ftmp;
+ ftmp = _p_d_self_v_cache2;
+ _p_d_self_v_cache2 = _p_d_self_v_cache1;
+ _p_d_self_v_cache1 = ftmp;
+
+ if (sample_one_token() == 0 || _batch_seq_len >= _tw._max_step) {
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_sample_id_buf, _p_d_sample_id,
+ _batch_token_num * sizeof(int),
+ cudaMemcpyDeviceToDevice, _stream));
+ CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
+ return _batch_seq_len;
+ }
+
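+  // autoregressive decoding loop: each step embeds only the newly sampled token
+  // (sequence length 1) and runs the *_with_cache attention/FFN variants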
+ while (1) {
+#ifdef DEBUG_RESULT
+ std::cout << "before sample:batch_size-" << _batch_size << " batch_seq_len-"
+ << _batch_seq_len << std::endl;
+ print_vec(_p_d_sample_id, "batch_token_ids", _batch_token_num);
+#endif
+
+ // token embedding, add position embedding and layer_norm
+ ker_gpt_embedding_i8I_launcher<_DataType>(
+ batch_size, 1, _tw._hidden_size, _stream, _int8_p_d_src_emb_bottom_wei,
+ _p_device_emb[1], _p_d_last_sample_id, _p_d_query, _p_d_real_seq_len,
+ _tw._padding_id, _batch_seq_len - 1, _src_emb_clip_max / _quant_range);
+
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ self_attention_with_cache();
+ ffn_add_norm_with_cache();
+ }
+
+ int8_t **ftmp = _p_d_self_k_cache2;
+ _p_d_self_k_cache2 = _p_d_self_k_cache1;
+ _p_d_self_k_cache1 = ftmp;
+ ftmp = _p_d_self_v_cache2;
+ _p_d_self_v_cache2 = _p_d_self_v_cache1;
+ _p_d_self_v_cache1 = ftmp;
+
+ if (sample_one_token_with_cache() == 0 || _batch_seq_len >= _tw._max_step)
+ break;
+ }
+
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_sample_id_buf, _p_d_sample_id,
+ _batch_token_num * sizeof(int),
+ cudaMemcpyDeviceToDevice, _stream));
+ CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
+
+ return _batch_seq_len;
+}
+
+template <OperationType OpType_>
+int QuantGptEncoder<OpType_>::sample_one_token() {
+ /* ---step 1. project hidden states to vocab logits--- */
+ cublasLtMM_withAlgo_i8IO(_int8_ffn_out_buf, 1, _batch_token_num,
+ _tw._src_vocab_size, _tw._hidden_size, 0, 0, 0,
+ _output_ln_clip_max * _src_emb_clip_max /
+ (_logits_clip_max * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_src_emb_wei,
+ _cublas_lt_handle, _stream, false);
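+  // reset the device-side unfinished flag; it is copied back to the host after
+  // sampling to decide whether decoding should continue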
+ CHECK_GPU_ERROR(cudaMemsetAsync(_p_d_unfinished, 0, sizeof(int), _stream));
+  /* ---step 2. sample new tokens from logits--- */
+ if (_tw._sampling_method == "topk") {
+#ifdef DEBUG_RESULT
+ std::cout << "sampling using topk\n";
+#endif
+ ker_topk_sample_i8I_launcher(
+ _batch_size, _batch_seq_len, _batch_seq_len, _max_thread_per_block,
+ _stream, _int8_ffn_out_buf, _p_d_sample_id, _p_d_sample_id_buf,
+ _p_d_real_seq_len, _tw._src_vocab_size, _tw._topk, _p_d_unfinished,
+ _p_d_curandstate, _tw._eos_id, _logits_clip_max / _quant_range, true);
+ } else {
+#ifdef DEBUG_RESULT
+ std::cout << "sampling using topp\n";
+#endif
+ ker_topp_sample_i8I_launcher(
+ _batch_size, _batch_seq_len, _batch_seq_len, _max_thread_per_block,
+ _stream, _int8_ffn_out_buf, _p_d_sample_id, _p_d_sample_id_buf,
+ _p_d_real_seq_len, _tw._src_vocab_size, _tw._topp, _p_d_unfinished,
+ _p_d_curandstate, _tw._eos_id, _logits_clip_max / _quant_range, true);
+ }
+ int *temp = _p_d_sample_id;
+ _p_d_sample_id = _p_d_sample_id_buf;
+ _p_d_sample_id_buf = temp;
+ CHECK_GPU_ERROR(cudaMemcpyAsync(&_h_unfinished, _p_d_unfinished, sizeof(int),
+ cudaMemcpyDeviceToHost, _stream));
+ CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
+ _p_d_last_sample_id = _p_d_sample_id_buf + _batch_token_num;
+ _batch_seq_len++;
+ _batch_token_num += _batch_size;
+ return _h_unfinished;
+}
+
+template <OperationType OpType_>
+int QuantGptEncoder<OpType_>::sample_one_token_with_cache() {
+ /* ---step 1. project hidden states to vocab logits--- */
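+  // with the K/V cache in place, only the last position of each sequence is
+  // projected to vocab logits (_batch_size rows instead of _batch_token_num)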
+ cublasLtMM_withAlgo_i8IO(_int8_ffn_out_buf, 1, _batch_size,
+ _tw._src_vocab_size, _tw._hidden_size, 0, 0, 0,
+ _output_ln_clip_max * _src_emb_clip_max /
+ (_logits_clip_max * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_src_emb_wei,
+ _cublas_lt_handle, _stream, false);
+
+ CHECK_GPU_ERROR(cudaMemsetAsync(_p_d_unfinished, 0, sizeof(int), _stream));
+  /* ---step 2. sample new tokens from logits--- */
+ if (_tw._sampling_method == "topk") {
+#ifdef DEBUG_RESULT
+ std::cout << "sampling using topk\n";
+#endif
+ ker_topk_sample_i8I_launcher(
+ _batch_size, _batch_seq_len, 1, _max_thread_per_block, _stream,
+ _int8_ffn_out_buf, _p_d_sample_id, _p_d_sample_id_buf,
+ _p_d_real_seq_len, _tw._src_vocab_size, _tw._topk, _p_d_unfinished,
+ _p_d_curandstate, _tw._eos_id, _logits_clip_max / _quant_range, true);
+ } else {
+#ifdef DEBUG_RESULT
+ std::cout << "sampling using topp\n";
+#endif
+ ker_topp_sample_i8I_launcher(
+ _batch_size, _batch_seq_len, 1, _max_thread_per_block, _stream,
+ _int8_ffn_out_buf, _p_d_sample_id, _p_d_sample_id_buf,
+ _p_d_real_seq_len, _tw._src_vocab_size, _tw._topp, _p_d_unfinished,
+ _p_d_curandstate, _tw._eos_id, _logits_clip_max / _quant_range, true);
+ }
+ int *temp = _p_d_sample_id;
+ _p_d_sample_id = _p_d_sample_id_buf;
+ _p_d_sample_id_buf = temp;
+ CHECK_GPU_ERROR(cudaMemcpyAsync(&_h_unfinished, _p_d_unfinished, sizeof(int),
+ cudaMemcpyDeviceToHost, _stream));
+ CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
+ _p_d_last_sample_id = _p_d_sample_id_buf + _batch_token_num;
+ _batch_seq_len++;
+ _batch_token_num += _batch_size;
+ return _h_unfinished;
+}
+
+template <OperationType OpType_>
+void QuantGptEncoder<OpType_>::self_attention() {
+ /* ---step 0. layer_norm, add output_bias to "query"--- */
+ if (_layer_id == 0) {
+ ker_norm_layer_resual_i8O_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _p_d_query,
+ _int8_ffn_in_buf, _p_device_wei[_weight_offset],
+ _p_device_wei[_weight_offset + 1], _p_device_wei[_weight_offset + 5],
+ _max_thread_per_block, _quant_range / _enc_clip_max[_layer_id * 12 + 4],
+ false, true);
+ }
+
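+  /* ---step 1. project the normalized input to fused q, k, v with an int8 GEMM
+     (hidden_size -> 3 * hidden_size)--- */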
+ cublasLtMM_withAlgo_i8IO(
+ _int8_ffn_out_buf, 1, _batch_token_num, _tw._hidden_size * 3,
+ _tw._hidden_size, 0, 0, 0,
+ _enc_clip_max[_layer_id * 12] * _enc_clip_max[_layer_id * 12 + 4] /
+ (_enc_clip_max[_layer_id * 12 + 8] * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4], _cublas_lt_handle,
+ _stream, false);
+
+#ifdef DEBUG_RESULT
+ print_vec(_int8_ffn_in_buf, "attn qkv in", 20);
+ print_vec(_int8_p_d_enc_wei[_layer_id * 4], "attn qkv w", 20);
+ print_vec(_int8_ffn_out_buf, "attn qkv out", 20);
+#endif
+
+ // get q, k, v by split and reshape qkv
+ ker_arrange_encself_qkv_i8I_i8O_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _int8_ffn_out_buf,
+ _p_device_wei[_weight_offset + 3], _int8_ffn_in_buf,
+ _p_d_self_k_cache1[_layer_id], _p_d_self_v_cache1[_layer_id], _p_d_v,
+ _batch_seq_len, _tw._dim_per_head, _tw._head_num, _max_thread_per_block,
+ _enc_clip_max[_layer_id * 12 + 8] / _quant_range,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 11], true);
+
+ /* ---step 2. correlation = q * k, perform softmax on correlation--- */
+ CHECK_GPU_ERROR(cublasGemmStridedBatchedEx(
+ _hd, CUBLAS_OP_T, CUBLAS_OP_N, _batch_seq_len, _batch_seq_len,
+ _tw._dim_per_head, &_ione, _p_d_self_k_cache1[_layer_id], CUDA_R_8I,
+ _tw._dim_per_head, _batch_seq_len * _tw._dim_per_head, _int8_ffn_in_buf,
+ CUDA_R_8I, _tw._dim_per_head, _batch_seq_len * _tw._dim_per_head, &_izero,
+ _int32_ffn_out_buf, CUDA_R_32I, _batch_seq_len,
+ _batch_seq_len * _batch_seq_len, _batch_size * _tw._head_num, CUDA_R_32I,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+ ker_correlation_softmax_gpt_i32I_launcher<_DataType>(
+ _batch_size, _batch_seq_len, _tw._head_num, _stream, _int32_ffn_out_buf,
+ _p_d_c, _p_d_real_seq_len, _atten_scaler,
+ _enc_clip_max[_layer_id * 12 + 11] / _quant_range);
+
+ /* ---step 3. new_q = correlation * v--- */
+ CHECK_GPU_ERROR(cublasGemmStridedBatchedEx(
+ _hd, CUBLAS_OP_N, CUBLAS_OP_N, _tw._dim_per_head, _batch_seq_len,
+ _batch_seq_len, &_fone, _p_d_v, _AType, _tw._dim_per_head,
+ _batch_seq_len * _tw._dim_per_head, _p_d_c, _BType, _batch_seq_len,
+ _batch_seq_len * _batch_seq_len, &_fzero, _p_d_q, _CType,
+ _tw._dim_per_head, _batch_seq_len * _tw._dim_per_head,
+ _batch_size * _tw._head_num, _computeType,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+  // use v to save the reshaped q, since they have the same size and v
+  // will not be used again before the next multi-head attention
+ ker_arrange_atten_output_i8O_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _p_d_q, _int8_ffn_in_buf,
+ _batch_seq_len, _tw._dim_per_head, _tw._head_num, _max_thread_per_block,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 5], false);
+
+ /* ---step 4. new_q = ori_q + new_q * output_wei--- */
+ cublaslt_gemm(
+ _int8_p_d_enc_wei[_layer_id * 4 + 1], _int8_ffn_in_buf, _int8_ffn_out_buf,
+ 1, _tw._hidden_size, _batch_token_num, _tw._hidden_size, 0, 0, 0,
+ _enc_clip_max[_layer_id * 12 + 1] * _enc_clip_max[_layer_id * 12 + 5] /
+ (_enc_clip_max[_layer_id * 12 + 9] * _quant_range),
+ _cublas_lt_handle, _stream);
+
+#ifdef DEBUG_RESULT
+ print_vec(_int8_ffn_in_buf, "attn out in", 20);
+ print_vec(_int8_p_d_enc_wei[_layer_id * 4 + 1], "attn out w", 20);
+ print_vec(_int8_ffn_out_buf, "attn out out", 20);
+#endif
+
+ ker_residual_bias_ln_i8I_i8O_launcher<_DataType>(
+ _int8_ffn_out_buf, _p_device_wei[_weight_offset + 6],
+ _p_device_wei[_weight_offset + 7], _p_device_wei[_weight_offset + 11],
+ _int8_ffn_in_buf, _p_d_query, _batch_token_num, _tw._hidden_size,
+ _enc_clip_max[_layer_id * 12 + 9] / _quant_range,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 6], _max_thread_per_block,
+ _stream, false, false, true);
+
+ return;
+}
+
+template <OperationType OpType_>