support QAT, export and inference for quantized BERT, GPT2 #285

Merged
merged 53 commits on May 7, 2022

Changes from all commits
53 commits
f391565
modify readme of examples
godweiyang Mar 15, 2022
5ed629a
modify table in example readme
godweiyang Mar 15, 2022
35adcfd
add cpp example of quant_transformer
godweiyang Mar 16, 2022
e8fa612
support huggingface bert ptq (stage 1)
godweiyang Mar 16, 2022
8c72f26
fix huggingface bert weight loading fp16 bug
godweiyang Mar 17, 2022
1d6c376
finetune quant bert from fp16 ckpt
godweiyang Mar 17, 2022
872b540
add emb quant of bert
godweiyang Mar 18, 2022
cf0caa6
add example of hf bert squad training, modify dir of huggingface trai…
godweiyang Mar 18, 2022
ae3ffbd
format
godweiyang Mar 18, 2022
6657d86
rename huggingface dir to fix conflict with datasets
godweiyang Mar 18, 2022
4a986ae
fix typo of gpt
godweiyang Mar 18, 2022
76aa5d8
export fairseq models to hdf5
godweiyang Mar 22, 2022
ebe1071
quant hdf5 load (stage 1)
godweiyang Mar 23, 2022
ee296bb
quant hdf5 transformer finished
godweiyang Mar 23, 2022
b964832
fix fairseq infer bug
godweiyang Mar 23, 2022
5f52dd6
export quant bert, delete hf quant pos emb
godweiyang Mar 24, 2022
63e90d9
add quant bert files
godweiyang Mar 24, 2022
d228cf2
support quant bert inference (not test)
godweiyang Mar 28, 2022
3400a1d
fix quant bert export name bug
godweiyang Mar 28, 2022
968f9ac
support quant bert inference
godweiyang Mar 30, 2022
df94d69
update black pre-commit version
godweiyang Mar 30, 2022
6d3e74c
add quant bert test example
godweiyang Apr 6, 2022
0189252
support cpp quant bert example
godweiyang Apr 6, 2022
1078cc2
format
godweiyang Apr 6, 2022
33bb905
modify readme
godweiyang Apr 8, 2022
3bb9f73
do not use ffn2 out quant if using gelu
godweiyang Apr 8, 2022
fa7b8cb
polish gemm test
godweiyang Apr 11, 2022
e8912b7
fix gemm test lt col bug
godweiyang Apr 14, 2022
ff64270
support gpt2 qat
godweiyang Apr 18, 2022
c17fdbb
add causal mask for gpt encoder
godweiyang Apr 18, 2022
19dd24a
support quant gpt export
godweiyang Apr 19, 2022
88ae1d7
add quant gpt required files
godweiyang Apr 19, 2022
a594523
fix conflict
godweiyang Apr 19, 2022
61cb0c4
support quant gpt inference (stage 1)
godweiyang Apr 20, 2022
436799d
fix conflict
godweiyang Apr 21, 2022
bc0a7d5
fix conflict
godweiyang Apr 21, 2022
c5f6aa2
add fake quant for logits gemm
godweiyang Apr 21, 2022
292cc3c
support quant gpt inference (stage 2)
godweiyang Apr 21, 2022
7ba1c6a
support quant gpt inference (stage 3)
godweiyang Apr 24, 2022
1ab6bfc
support quant gpt inference (ppl)
godweiyang Apr 25, 2022
ca9739b
support quant gpt inference (TODO: fix qkv bias out clip_max, sampling)
godweiyang Apr 25, 2022
56eb950
support quant gpt inference (ppl)
godweiyang Apr 26, 2022
d3a5807
support quant gpt inference (sampling)
godweiyang Apr 27, 2022
a37e20f
support quant decoder sampling
godweiyang Apr 27, 2022
305ef73
modify readme (add install command)
godweiyang Apr 27, 2022
c1141d8
optimize quant gpt gemm, fix gelu bug
godweiyang Apr 27, 2022
7ff1fc4
optimize cpp example
godweiyang Apr 27, 2022
6a4c705
replace quant gpt cache memcpy with pointer switch
godweiyang Apr 28, 2022
e68bf8f
fuse quant gpt softmax kernel
godweiyang Apr 28, 2022
9e40037
optimize quant gpt arrange-qkv kernel
godweiyang Apr 28, 2022
dd71c87
modify PyPI spelling
godweiyang May 5, 2022
c687af2
Merge branch 'master' into opt-example
godweiyang May 5, 2022
8c4b81e
fix gpt memory spelling
godweiyang May 5, 2022
20 changes: 16 additions & 4 deletions README.md
@@ -41,7 +41,7 @@ The following is a support matrix of LightSeq **inference** library compared wit
## Performance

### [>>> Training](./lightseq/training)
Here we present the experimental results on WMT14 English to German translation task based on Transformer-big models. We train Transformer models of different sizes on eight NVIDIA Tesla V100/NVIDIA Ampere A100 GPUs with data parallel and fp16 mixed precision.
Here we present the experimental results on WMT14 English to German translation task based on Transformer-big models. We train Transformer models of different sizes on eight NVIDIA Tesla V100/NVIDIA Tesla A100 GPUs with data parallel and fp16 mixed precision.
[Fairseq](https://github.com/pytorch/fairseq) with [Apex](https://github.com/NVIDIA/apex) is chosen as our baseline.

<img src="./docs/training/images/single_step.png" width="80%" aligned="middle">
@@ -66,6 +66,20 @@ More results are available [here](./docs/inference/performance.md).
## Quick Start
Complete user guide is available [here](docs/guide.md).

### Installation
You can install LightSeq from PyPI:
```shell
$ pip install lightseq
```

LightSeq installation from PyPI only supports Python 3.6 to 3.8 on Linux for now. Consider compiling from source if you have other environments:
```shell
$ PATH=/usr/local/hdf5/:$PATH ENABLE_FP32=0 ENABLE_DEBUG=0 pip install -e $PROJECT_DIR
```

Detailed build instructions are available [here](docs/inference/build.md).
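If the install succeeds, a quick sanity check is to import the package from Python (a minimal sketch; it only confirms that the wheel or source build is importable):
```shell
$ python -c "import lightseq; print(lightseq.__file__)"
```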


### Fast training from Fairseq

You can experience lightning fast training by running the following commands,
@@ -97,12 +111,10 @@ $ cd examples/inference/python
then you can check the performance by simply running the following commands. `hf_bart_export.py` is used to transform PyTorch weights to LightSeq protobuf.

```shell
$ python export/hf_bart_export.py
$ python export/huggingface/hf_bart_export.py
$ python test/ls_bart.py
```
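For reference, the test step boils down to loading the exported weights with the Python inference API, roughly as sketched below (the weight file name, maximum batch size, and token ids are illustrative assumptions; `test/ls_bart.py` is the authoritative version):
```python
import lightseq.inference as lsi

# Illustrative values only: the exported file name and max batch size are assumptions.
model = lsi.Transformer("lightseq_bart_base.pb", 128)  # exported weights, max batch size
output_token_ids = model.infer([[63, 47, 65, 1507, 88, 74]])  # a batch of token id lists
print(output_token_ids)
```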

LightSeq installation from pypi only supports python 3.6 to 3.8 on Linux for now. Consider compiling from source if you have other environments.

More usage is available [here](./lightseq/inference/README.md).

### Fast deploy inference server
2 changes: 1 addition & 1 deletion docker/README.md
@@ -1,5 +1,5 @@
## Dockerfiles of lightseq

Pypi: for publish python package.
PyPI: for publish python package.

Tritonserver: for publish tritonserver
4 changes: 2 additions & 2 deletions docs/guide.md
@@ -119,7 +119,7 @@ These functions can export the configuration, embedding, encoder and decoder wei
LightSeq provides export examples of native Hugging Face BERT/BART/GPT2, Fairseq models trained with LightSeq, and the LightSeq Transformer. All code is available [here](../examples/inference/python/export).

#### Fairseq
The main code is as follows (some parameters are omitted). Complete code is available [here](../examples/inference/python/export/ls_fs_transformer_export.py).
The main code is as follows (some parameters are omitted). Complete code is available [here](../examples/inference/python/export/fairseq/ls_fs_transformer_export.py).
```python
model = Transformer()
encoder_state_dict, decoder_state_dict = _extract_weight(state_dict)
@@ -136,7 +136,7 @@ First, you need to divide the state dict into two parts of encoder and decoder,
The above functions export the checkpoints to protobuf by default. Specify `save_pb=False` to export to hdf5 files. You can use the [Fairseq training example](../examples/training/fairseq) to obtain the trained checkpoints.
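As a concrete illustration of the splitting step, here is a minimal sketch (the `encoder.`/`decoder.` key prefixes follow Fairseq's naming and are an assumption here; the real logic lives in `_extract_weight` in the example script):
```python
# Hedged sketch: split a Fairseq-style state dict into encoder and decoder parts by key prefix.
def split_state_dict(state_dict):
    encoder_state, decoder_state = {}, {}
    for name, tensor in state_dict.items():
        if name.startswith("encoder."):
            encoder_state[name] = tensor
        elif name.startswith("decoder."):
            decoder_state[name] = tensor
    return encoder_state, decoder_state

# Toy usage with dummy values standing in for real tensors.
enc, dec = split_state_dict(
    {"encoder.layers.0.fc1.weight": 0, "decoder.layers.0.fc1.weight": 1}
)
print(len(enc), len(dec))  # 1 1
```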

#### Hugging Face
LightSeq provides three examples of exporting native Hugging Face models ([BERT](../examples/inference/python/export/hf_bert_export.py), [BART](../examples/inference/python/export/hf_bart_export.py) and [GPT2](../examples/inference/python/export/hf_gpt2_export.py)). Because these native models were not pretrained with LightSeq modules, users must write the export rules manually.
LightSeq provides three examples of exporting native Hugging Face models ([BERT](../examples/inference/python/export/huggingface/hf_bert_export.py), [BART](../examples/inference/python/export/huggingface/hf_bart_export.py) and [GPT2](../examples/inference/python/export/huggingface/hf_gpt2_export.py)). Because these native models were not pretrained with LightSeq modules, users must write the export rules manually.

#### LightSeq Transformer
LightSeq provides an example of exporting its own Transformer module, which is similar to the Fairseq model export. You can use the [custom training example](../examples/training/custom) to obtain the trained checkpoints. This export example can also compare the results and speeds of forward propagation in the training library with those of the inference library loading both protobuf and hdf5 files. The results show that the inference library is about 2x faster than the forward propagation of the training library.
Binary file modified docs/training/images/single_step.png
9 changes: 9 additions & 0 deletions examples/inference/cpp/CMakeLists.txt
@@ -3,11 +3,20 @@ cmake_minimum_required(VERSION 3.18)
add_executable(transformer_example transformer_example.cc)
target_link_libraries(transformer_example PUBLIC liblightseq)

add_executable(quant_transformer_example quant_transformer_example.cc)
target_link_libraries(quant_transformer_example PUBLIC liblightseq)

add_executable(bert_example bert_example.cc)
target_link_libraries(bert_example PUBLIC liblightseq)

add_executable(quant_bert_example quant_bert_example.cc)
target_link_libraries(quant_bert_example PUBLIC liblightseq)

add_executable(gpt_example gpt_example.cc)
target_link_libraries(gpt_example PUBLIC liblightseq)

add_executable(quant_gpt_example quant_gpt_example.cc)
target_link_libraries(quant_gpt_example PUBLIC liblightseq)

add_executable(transformer_decoder_example decoder_example.cc.cu)
target_link_libraries(transformer_decoder_example PUBLIC transformer_model)
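With these targets added, the new binaries build like the existing examples; a hedged sketch of the build commands, assuming an already configured `build` directory:
```shell
$ cd build
$ cmake --build . --target quant_transformer_example quant_bert_example quant_gpt_example
```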
24 changes: 20 additions & 4 deletions examples/inference/cpp/bert_example.cc
@@ -8,15 +8,31 @@ Example of how to run Bert inference using our implementation.

int main(int argc, char* argv[]) {
std::string model_weights_path = argv[1];
std::vector<int> example_input = {2859, 2758, 2051, 2157,
2005, 6629, 7566, 1012};
int eg_seq_len = example_input.size();
int max_batch_size = 128;
int batch_size = 1;
int batch_seq_len = eg_seq_len;

if (argc == 4) {
batch_size = atoi(argv[2]);
batch_seq_len = atoi(argv[3]);
}
if (batch_size > max_batch_size) {
throw std::runtime_error("batch_size exceeds the maximum (128)!");
}

std::vector<int> host_input;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < batch_seq_len; ++j) {
host_input.push_back(example_input[j % eg_seq_len]);
}
}

auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
"Bert", model_weights_path, max_batch_size);

int batch_size = 1;
int batch_seq_len = 8;
std::vector<int> host_input = {101, 4931, 1010, 2129, 2024, 2017, 102, 0};

void* d_input;
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
25 changes: 20 additions & 5 deletions examples/inference/cpp/gpt_example.cc
@@ -8,15 +8,30 @@ Example of how to run gpt inference using our implementation.

int main(int argc, char* argv[]) {
std::string model_weights_path = argv[1];
std::vector<int> example_input = {40, 1842, 345, 11, 475, 345, 910, 326};
int eg_seq_len = example_input.size();
int max_batch_size = 128;
int batch_size = 1;
int batch_seq_len = eg_seq_len;

if (argc == 4) {
batch_size = atoi(argv[2]);
batch_seq_len = atoi(argv[3]);
}
if (batch_size > max_batch_size) {
throw std::runtime_error("batch_size exceeds the maximum (128)!");
}

std::vector<int> host_input;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < batch_seq_len; ++j) {
host_input.push_back(example_input[j % eg_seq_len]);
}
}

auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
"Gpt", model_weights_path, max_batch_size);

int batch_size = 1;
int batch_seq_len = 5;
std::vector<int> host_input = {3666, 1438, 318, 402, 11571};

void* d_input;
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
@@ -58,7 +73,7 @@ int main(int argc, char* argv[]) {
}
std::cout << std::endl;

lightseq::cuda::print_vec(d_output, "output", 5);
lightseq::cuda::print_vec(d_output, "output", 10);
}

return 0;
81 changes: 81 additions & 0 deletions examples/inference/cpp/quant_bert_example.cc
@@ -0,0 +1,81 @@
#include "model_base.h"
#include "util.h"

/**
@file
Example of how to run QuantBert inference using our implementation.
*/

int main(int argc, char* argv[]) {
std::string model_weights_path = argv[1];
std::vector<int> example_input = {2859, 2758, 2051, 2157,
2005, 6629, 7566, 1012};
int eg_seq_len = example_input.size();
int max_batch_size = 128;
int batch_size = 1;
int batch_seq_len = eg_seq_len;

if (argc == 4) {
batch_size = atoi(argv[2]);
batch_seq_len = atoi(argv[3]);
}
if (batch_size > max_batch_size) {
throw std::runtime_error("batch_size exceeds the maximum (128)!");
}

std::vector<int> host_input;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < batch_seq_len; ++j) {
host_input.push_back(example_input[j % eg_seq_len]);
}
}

auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
"QuantBert", model_weights_path, max_batch_size);

void* d_input;
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
lightseq::cuda::CHECK_GPU_ERROR(cudaMemcpy(
d_input, host_input.data(), sizeof(int) * batch_size * batch_seq_len,
cudaMemcpyHostToDevice));

model->set_input_ptr(0, d_input);
model->set_input_shape(0, {batch_size, batch_seq_len});

for (int i = 0; i < model->get_output_size(); i++) {
void* d_output;
std::vector<int> shape = model->get_output_max_shape(i);
int total_size = 1;
for (int j = 0; j < shape.size(); j++) {
total_size *= shape[j];
}
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_output, total_size * sizeof(int)));
model->set_output_ptr(i, d_output);
}
lightseq::cuda::CHECK_GPU_ERROR(cudaStreamSynchronize(0));
std::cout << "infer preprocessing finished" << std::endl;

/* ---step5. infer and log--- */
for (int i = 0; i < 10; i++) {
auto start = std::chrono::high_resolution_clock::now();
model->Infer();
lightseq::cuda::print_time_duration(start, "one infer time", 0);
}

for (int i = 0; i < model->get_output_size(); i++) {
const float* d_output;
d_output = static_cast<const float*>(model->get_output_ptr(i));
std::vector<int> shape = model->get_output_shape(i);
std::cout << "output shape: ";
for (int j = 0; j < shape.size(); j++) {
std::cout << shape[j] << " ";
}
std::cout << std::endl;

lightseq::cuda::print_vec(d_output, "output", 5);
}

return 0;
}
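Given the argument handling above, a typical run might look like the following (the weights path is illustrative; the optional batch size and sequence length must keep the batch size within the 128 maximum). The `quant_gpt_example` below follows the same convention:
```shell
$ ./quant_bert_example /path/to/quant_bert_weights.hdf5 8 64
```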
80 changes: 80 additions & 0 deletions examples/inference/cpp/quant_gpt_example.cc
@@ -0,0 +1,80 @@
#include "model_base.h"
#include "gpt.h"

/**
@file
Example of how to run QuantGpt inference using our implementation.
*/

int main(int argc, char* argv[]) {
std::string model_weights_path = argv[1];
std::vector<int> example_input = {40, 1842, 345, 11, 475, 345, 910, 326};
int eg_seq_len = example_input.size();
int max_batch_size = 128;
int batch_size = 1;
int batch_seq_len = eg_seq_len;

if (argc == 4) {
batch_size = atoi(argv[2]);
batch_seq_len = atoi(argv[3]);
}
if (batch_size > max_batch_size) {
throw std::runtime_error("batch_size exceeds the maximum (128)!");
}

std::vector<int> host_input;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < batch_seq_len; ++j) {
host_input.push_back(example_input[j % eg_seq_len]);
}
}

auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
"QuantGpt", model_weights_path, max_batch_size);

void* d_input;
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
lightseq::cuda::CHECK_GPU_ERROR(cudaMemcpy(
d_input, host_input.data(), sizeof(int) * batch_size * batch_seq_len,
cudaMemcpyHostToDevice));

model->set_input_ptr(0, d_input);
model->set_input_shape(0, {batch_size, batch_seq_len});

for (int i = 0; i < model->get_output_size(); i++) {
void* d_output;
std::vector<int> shape = model->get_output_max_shape(i);
int total_size = 1;
for (int j = 0; j < shape.size(); j++) {
total_size *= shape[j];
}
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_output, total_size * sizeof(int)));
model->set_output_ptr(i, d_output);
}
lightseq::cuda::CHECK_GPU_ERROR(cudaStreamSynchronize(0));
std::cout << "infer preprocessing finished" << std::endl;

/* ---step5. infer and log--- */
for (int i = 0; i < 10; i++) {
auto start = std::chrono::high_resolution_clock::now();
model->Infer();
lightseq::cuda::print_time_duration(start, "one infer time", 0);
}

for (int i = 0; i < model->get_output_size(); i++) {
const int* d_output;
d_output = static_cast<const int*>(model->get_output_ptr(i));
std::vector<int> shape = model->get_output_shape(i);
std::cout << "output shape: ";
for (int j = 0; j < shape.size(); j++) {
std::cout << shape[j] << " ";
}
std::cout << std::endl;

lightseq::cuda::print_vec(d_output, "output", 10);
}

return 0;
}