diff --git a/README.md b/README.md
index e351df87..f5a5a20b 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ The following is a support matrix of LightSeq **inference** library compared wit
## Performance
### [>>> Training](./lightseq/training)
-Here we present the experimental results on WMT14 English to German translation task based on Transformer-big models. We train Transformer models of different sizes on eight NVIDIA Tesla V100/NVIDIA Ampere A100 GPUs with data parallel and fp16 mixed precision.
+Here we present the experimental results on the WMT14 English to German translation task based on Transformer-big models. We train Transformer models of different sizes on eight NVIDIA Tesla V100/NVIDIA A100 GPUs with data parallelism and fp16 mixed precision.
[Fairseq](https://github.com/pytorch/fairseq) with [Apex](https://github.com/NVIDIA/apex) is chosen as our baseline.
@@ -66,6 +66,20 @@ More results is available [here](./docs/inference/performance.md).
## Quick Start
Complete user guide is available [here](docs/guide.md).
+### Installation
+You can install LightSeq from PyPI:
+```shell
+$ pip install lightseq
+```
+
+LightSeq installation from PyPI only supports Python 3.6 to 3.8 on Linux for now. Consider compiling from source if you use other environments:
+```shell
+$ PATH=/usr/local/hdf5/:$PATH ENABLE_FP32=0 ENABLE_DEBUG=0 pip install -e $PROJECT_DIR
+```
+
+Detailed build instructions are available [here](docs/inference/build.md).
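+
+You can quickly verify the installation with an import check (a minimal sketch; it only confirms that the Python packages load, and the training package additionally assumes PyTorch is installed):
+```python
+# sanity check: both the training and inference packages should import cleanly
+import lightseq.training
+import lightseq.inference as lsi
+
+print(lsi.__name__)
+```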
+
+
### Fast training from Fairseq
You can experience lightning fast training by running the following commands,
@@ -97,12 +111,10 @@ $ cd examples/inference/python
then you can check the performance by simply running the following commands. `hf_bart_export.py` is used to transform PyTorch weights to the LightSeq protobuf format.
```shell
-$ python export/hf_bart_export.py
+$ python export/huggingface/hf_bart_export.py
$ python test/ls_bart.py
```
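+
+`test/ls_bart.py` loads the exported file (named `lightseq_bart_base` with a `.pb` or `.hdf5` extension by default) and checks the outputs and speed against the original Hugging Face model. Below is a minimal hedged sketch of the LightSeq side, assuming the exported BART file is loaded with `lsi.Transformer` as in the Fairseq examples (the file name and token ids are placeholders):
+```python
+import lightseq.inference as lsi
+
+# default output name of hf_bart_export.py; the extension depends on the export format
+model = lsi.Transformer("lightseq_bart_base.hdf5", 128)  # 128 = max batch size
+output = model.infer([[0, 100, 657, 14, 1816, 6, 2]])  # placeholder BART token ids
+print(output)
+```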
-LightSeq installation from pypi only supports python 3.6 to 3.8 on Linux for now. Consider compiling from source if you have other environments.
-
More usage is available [here](./lightseq/inference/README.md).
### Fast deploy inference server
diff --git a/docker/README.md b/docker/README.md
index f29df5c6..375f5f4e 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,5 +1,5 @@
## Dockerfiles of lightseq
-Pypi: for publish python package.
+PyPI: for publishing the Python package.
Tritonserver: for publishing tritonserver.
diff --git a/docs/guide.md b/docs/guide.md
index 1fd427c3..651cc616 100644
--- a/docs/guide.md
+++ b/docs/guide.md
@@ -119,7 +119,7 @@ These functions can export the configuration, embedding, encoder and decoder wei
LightSeq provides export examples of native Hugging Face BERT/BART/GPT2, Fairseq trained with LightSeq and LightSeq Transformer. All codes are available [here](../examples/inference/python/export).
#### Fairseq
-The main code is as follows (some parameters are omitted). Complete code is available [here](../examples/inference/python/export/ls_fs_transformer_export.py).
+The main code is as follows (some parameters are omitted). Complete code is available [here](../examples/inference/python/export/fairseq/ls_fs_transformer_export.py).
```python
model = Transformer()
encoder_state_dict, decoder_state_dict = _extract_weight(state_dict)
@@ -136,7 +136,7 @@ First, you need to divide the state dict into two parts of encoder and decoder,
The above functions export the checkpoints to protobuf by default. Specify `save_pb=False` to export to hdf5 files. You can use the [Fairseq training example](../examples/training/fairseq) to obtain the trained checkpoints.
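+
+For illustration, here is a minimal sketch of the two target objects that the `export_ls_*` helpers can write to (the file name is a placeholder):
+```python
+import h5py
+from export.proto.transformer_pb2 import Transformer
+
+save_pb = True
+if save_pb:
+    # protobuf target, later serialized with file.SerializeToString()
+    file = Transformer()
+else:
+    # hdf5 target, datasets are created directly inside this file
+    file = h5py.File("checkpoint_best.hdf5", "w")
+```
+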
#### Hugging Face
-LightSeq provides three examples of exporting native Hugging Face models ([BERT](../examples/inference/python/export/hf_bert_export.py), [BART](../examples/inference/python/export/hf_bart_export.py) and [GPT2](../examples/inference/python/export/hf_gpt2_export.py)). Because these native models did not use LightSeq modules to pretrain, the users must manually make the export rules.
+LightSeq provides three examples of exporting native Hugging Face models ([BERT](../examples/inference/python/export/huggingface/hf_bert_export.py), [BART](../examples/inference/python/export/huggingface/hf_bart_export.py) and [GPT2](../examples/inference/python/export/huggingface/hf_gpt2_export.py)). Because these native models were not pretrained with LightSeq modules, users must manually define the export rules.
#### LightSeq Transformer
LightSeq provides an example of exporting its own Transformer module, which is similar to the Fairseq model export. You can use the [custom training example](../examples/training/custom) to obtain the trained checkpoints. This export example can also compare the results and speeds of forward propagation in the training library with the inference library loading both protobuf and hdf5 files. The results show that the inference library is about 2x faster than the forward propagation of the training library.
diff --git a/docs/training/images/single_step.png b/docs/training/images/single_step.png
index aae28f40..ea79e34c 100644
Binary files a/docs/training/images/single_step.png and b/docs/training/images/single_step.png differ
diff --git a/examples/inference/cpp/CMakeLists.txt b/examples/inference/cpp/CMakeLists.txt
index 64cec769..dbf92330 100644
--- a/examples/inference/cpp/CMakeLists.txt
+++ b/examples/inference/cpp/CMakeLists.txt
@@ -3,11 +3,20 @@ cmake_minimum_required(VERSION 3.18)
add_executable(transformer_example transformer_example.cc)
target_link_libraries(transformer_example PUBLIC liblightseq)
+add_executable(quant_transformer_example quant_transformer_example.cc)
+target_link_libraries(quant_transformer_example PUBLIC liblightseq)
+
add_executable(bert_example bert_example.cc)
target_link_libraries(bert_example PUBLIC liblightseq)
+add_executable(quant_bert_example quant_bert_example.cc)
+target_link_libraries(quant_bert_example PUBLIC liblightseq)
+
add_executable(gpt_example gpt_example.cc)
target_link_libraries(gpt_example PUBLIC liblightseq)
+add_executable(quant_gpt_example quant_gpt_example.cc)
+target_link_libraries(quant_gpt_example PUBLIC liblightseq)
+
add_executable(transformer_decoder_example decoder_example.cc.cu)
target_link_libraries(transformer_decoder_example PUBLIC transformer_model)
diff --git a/examples/inference/cpp/bert_example.cc b/examples/inference/cpp/bert_example.cc
index cdec69a1..22c08bb7 100644
--- a/examples/inference/cpp/bert_example.cc
+++ b/examples/inference/cpp/bert_example.cc
@@ -8,15 +8,31 @@ Example of how to run Bert inference using our implementation.
int main(int argc, char* argv[]) {
std::string model_weights_path = argv[1];
+ std::vector<int> example_input = {2859, 2758, 2051, 2157,
+ 2005, 6629, 7566, 1012};
+ int eg_seq_len = example_input.size();
int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
"Bert", model_weights_path, max_batch_size);
- int batch_size = 1;
- int batch_seq_len = 8;
- std::vector<int> host_input = {101, 4931, 1010, 2129, 2024, 2017, 102, 0};
-
void* d_input;
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
diff --git a/examples/inference/cpp/gpt_example.cc b/examples/inference/cpp/gpt_example.cc
index c1defe1a..bc07d90e 100644
--- a/examples/inference/cpp/gpt_example.cc
+++ b/examples/inference/cpp/gpt_example.cc
@@ -8,15 +8,30 @@ Example of how to run gpt inference using our implementation.
int main(int argc, char* argv[]) {
std::string model_weights_path = argv[1];
+ std::vector<int> example_input = {40, 1842, 345, 11, 475, 345, 910, 326};
+ int eg_seq_len = example_input.size();
int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
"Gpt", model_weights_path, max_batch_size);
- int batch_size = 1;
- int batch_seq_len = 5;
- std::vector<int> host_input = {3666, 1438, 318, 402, 11571};
-
void* d_input;
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
@@ -58,7 +73,7 @@ int main(int argc, char* argv[]) {
}
std::cout << std::endl;
- lightseq::cuda::print_vec(d_output, "output", 5);
+ lightseq::cuda::print_vec(d_output, "output", 10);
}
return 0;
diff --git a/examples/inference/cpp/quant_bert_example.cc b/examples/inference/cpp/quant_bert_example.cc
new file mode 100644
index 00000000..54ff5c14
--- /dev/null
+++ b/examples/inference/cpp/quant_bert_example.cc
@@ -0,0 +1,81 @@
+#include "model_base.h"
+#include "util.h"
+
+/**
+@file
+Example of how to run QuantBert inference using our implementation.
+*/
+
+int main(int argc, char* argv[]) {
+ std::string model_weights_path = argv[1];
+ std::vector<int> example_input = {2859, 2758, 2051, 2157,
+ 2005, 6629, 7566, 1012};
+ int eg_seq_len = example_input.size();
+ int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
+
+ auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
+ "QuantBert", model_weights_path, max_batch_size);
+
+ void* d_input;
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
+ lightseq::cuda::CHECK_GPU_ERROR(cudaMemcpy(
+ d_input, host_input.data(), sizeof(int) * batch_size * batch_seq_len,
+ cudaMemcpyHostToDevice));
+
+ model->set_input_ptr(0, d_input);
+ model->set_input_shape(0, {batch_size, batch_seq_len});
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ void* d_output;
+ std::vector<int> shape = model->get_output_max_shape(i);
+ int total_size = 1;
+ for (int j = 0; j < shape.size(); j++) {
+ total_size *= shape[j];
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_output, total_size * sizeof(int)));
+ model->set_output_ptr(i, d_output);
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(cudaStreamSynchronize(0));
+ std::cout << "infer preprocessing finished" << std::endl;
+
+ /* ---step5. infer and log--- */
+ for (int i = 0; i < 10; i++) {
+ auto start = std::chrono::high_resolution_clock::now();
+ model->Infer();
+ lightseq::cuda::print_time_duration(start, "one infer time", 0);
+ }
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ const float* d_output;
+ d_output = static_cast<const float*>(model->get_output_ptr(i));
+ std::vector<int> shape = model->get_output_shape(i);
+ std::cout << "output shape: ";
+ for (int j = 0; j < shape.size(); j++) {
+ std::cout << shape[j] << " ";
+ }
+ std::cout << std::endl;
+
+ lightseq::cuda::print_vec(d_output, "output", 5);
+ }
+
+ return 0;
+}
diff --git a/examples/inference/cpp/quant_gpt_example.cc b/examples/inference/cpp/quant_gpt_example.cc
new file mode 100644
index 00000000..6a3dce42
--- /dev/null
+++ b/examples/inference/cpp/quant_gpt_example.cc
@@ -0,0 +1,80 @@
+#include "model_base.h"
+#include "gpt.h"
+
+/**
+@file
+Example of how to run gpt inference using our implementation.
+*/
+
+int main(int argc, char* argv[]) {
+ std::string model_weights_path = argv[1];
+ std::vector<int> example_input = {40, 1842, 345, 11, 475, 345, 910, 326};
+ int eg_seq_len = example_input.size();
+ int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
+
+ auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
+ "QuantGpt", model_weights_path, max_batch_size);
+
+ void* d_input;
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
+ lightseq::cuda::CHECK_GPU_ERROR(cudaMemcpy(
+ d_input, host_input.data(), sizeof(int) * batch_size * batch_seq_len,
+ cudaMemcpyHostToDevice));
+
+ model->set_input_ptr(0, d_input);
+ model->set_input_shape(0, {batch_size, batch_seq_len});
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ void* d_output;
+ std::vector<int> shape = model->get_output_max_shape(i);
+ int total_size = 1;
+ for (int j = 0; j < shape.size(); j++) {
+ total_size *= shape[j];
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_output, total_size * sizeof(int)));
+ model->set_output_ptr(i, d_output);
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(cudaStreamSynchronize(0));
+ std::cout << "infer preprocessing finished" << std::endl;
+
+ /* ---step5. infer and log--- */
+ for (int i = 0; i < 10; i++) {
+ auto start = std::chrono::high_resolution_clock::now();
+ model->Infer();
+ lightseq::cuda::print_time_duration(start, "one infer time", 0);
+ }
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ const int* d_output;
+ d_output = static_cast<const int*>(model->get_output_ptr(i));
+ std::vector<int> shape = model->get_output_shape(i);
+ std::cout << "output shape: ";
+ for (int j = 0; j < shape.size(); j++) {
+ std::cout << shape[j] << " ";
+ }
+ std::cout << std::endl;
+
+ lightseq::cuda::print_vec(d_output, "output", 10);
+ }
+
+ return 0;
+}
diff --git a/examples/inference/cpp/quant_transformer_example.cc b/examples/inference/cpp/quant_transformer_example.cc
new file mode 100644
index 00000000..4073b8a3
--- /dev/null
+++ b/examples/inference/cpp/quant_transformer_example.cc
@@ -0,0 +1,88 @@
+#include "model_base.h"
+#include "util.h"
+
+/**
+@file
+Example of how to run quantized transformer inference using our implementation.
+*/
+
+int main(int argc, char* argv[]) {
+ std::string model_weights_path = argv[1];
+
+ std::vector<int> example_input = {63, 47, 65, 1507, 88, 74,
+ 10, 2057, 362, 9, 284, 6};
+ int eg_seq_len = example_input.size();
+ int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
+
+ auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
+ "QuantTransformer", model_weights_path, max_batch_size);
+
+ void* d_input;
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
+ lightseq::cuda::CHECK_GPU_ERROR(cudaMemcpy(
+ d_input, host_input.data(), sizeof(int) * batch_size * batch_seq_len,
+ cudaMemcpyHostToDevice));
+
+ model->set_input_ptr(0, d_input);
+ model->set_input_shape(0, {batch_size, batch_seq_len});
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ void* d_output;
+ std::vector<int> shape = model->get_output_max_shape(i);
+ int total_size = 1;
+ for (int j = 0; j < shape.size(); j++) {
+ total_size *= shape[j];
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(
+ cudaMalloc(&d_output, total_size * sizeof(int)));
+ model->set_output_ptr(i, d_output);
+ }
+ lightseq::cuda::CHECK_GPU_ERROR(cudaStreamSynchronize(0));
+ std::cout << "infer preprocessing finished" << std::endl;
+
+ /* ---step5. infer and log--- */
+ for (int i = 0; i < 20; i++) {
+ auto start = std::chrono::high_resolution_clock::now();
+ model->Infer();
+ lightseq::cuda::print_time_duration(start, "one infer time", 0);
+ }
+
+ for (int i = 0; i < model->get_output_size(); i++) {
+ const void* d_output;
+ d_output = static_cast<const void*>(model->get_output_ptr(i));
+ std::vector<int> shape = model->get_output_shape(i);
+ std::cout << "output shape: ";
+ for (int j = 0; j < shape.size(); j++) {
+ std::cout << shape[j] << " ";
+ }
+ std::cout << std::endl;
+
+ if (!i)
+ lightseq::cuda::print_vec((int*)d_output, "output", 15);
+ else
+ lightseq::cuda::print_vec((float*)d_output, "output", 5);
+ }
+
+ // const int* res = model.get_result_ptr();
+ // const float* res_score = model.get_score_ptr();
+ // lightseq::cuda::print_vec(res_score, "res score", 5);
+ return 0;
+}
diff --git a/examples/inference/cpp/transformer_example.cc b/examples/inference/cpp/transformer_example.cc
index 6998064a..68f2f101 100644
--- a/examples/inference/cpp/transformer_example.cc
+++ b/examples/inference/cpp/transformer_example.cc
@@ -8,16 +8,32 @@ Example of how to run transformer inference using our implementation.
int main(int argc, char* argv[]) {
std::string model_weights_path = argv[1];
+
+ std::vector<int> example_input = {63, 47, 65, 1507, 88, 74,
+ 10, 2057, 362, 9, 284, 6};
+ int eg_seq_len = example_input.size();
int max_batch_size = 128;
+ int batch_size = 1;
+ int batch_seq_len = eg_seq_len;
+
+ if (argc == 4) {
+ batch_size = atoi(argv[2]);
+ batch_seq_len = atoi(argv[3]);
+ }
+ if (batch_size > max_batch_size) {
+ throw std::runtime_error("batch_size exceeds the maximum (128)!");
+ }
+
+ std::vector<int> host_input;
+ for (int i = 0; i < batch_size; ++i) {
+ for (int j = 0; j < batch_seq_len; ++j) {
+ host_input.push_back(example_input[j % eg_seq_len]);
+ }
+ }
auto model = lightseq::cuda::LSModelFactory::GetInstance().CreateModel(
"Transformer", model_weights_path, max_batch_size);
- int batch_size = 1;
- int batch_seq_len = 14;
- std::vector<int> host_input = {0, 100, 657, 14, 1816, 6, 53,
- 50264, 473, 45, 50264, 162, 4, 2};
-
void* d_input;
lightseq::cuda::CHECK_GPU_ERROR(
cudaMalloc(&d_input, sizeof(int) * batch_size * batch_seq_len));
@@ -43,14 +59,14 @@ int main(int argc, char* argv[]) {
std::cout << "infer preprocessing finished" << std::endl;
/* ---step5. infer and log--- */
- for (int i = 0; i < 10; i++) {
+ for (int i = 0; i < 20; i++) {
auto start = std::chrono::high_resolution_clock::now();
model->Infer();
lightseq::cuda::print_time_duration(start, "one infer time", 0);
}
for (int i = 0; i < model->get_output_size(); i++) {
- const float* d_output;
+ const void* d_output;
d_output = static_cast<const void*>(model->get_output_ptr(i));
std::vector shape = model->get_output_shape(i);
std::cout << "output shape: ";
@@ -59,7 +75,10 @@ int main(int argc, char* argv[]) {
}
std::cout << std::endl;
- lightseq::cuda::print_vec(d_output, "output", 5);
+ if (!i)
+ lightseq::cuda::print_vec((int*)d_output, "output", 15);
+ else
+ lightseq::cuda::print_vec((float*)d_output, "output", 5);
}
// const int* res = model.get_result_ptr();
diff --git a/examples/inference/python/README.md b/examples/inference/python/README.md
index 43c56aed..da721458 100644
--- a/examples/inference/python/README.md
+++ b/examples/inference/python/README.md
@@ -1,129 +1,62 @@
-# Examples of exporting models for LightSeq inference
-
-## Switch to the current directory
-```shell
-cd examples/inference/python
-```
-
-## Export models
-### Hugging Face
-1. Hugging Face BART
-
-Export Hugging Face BART models to protobuf/hdf5 format.
-```shell
-python export/huggingface/hf_bart_export.py
-```
-2. Hugging Face BERT
-
-Export Hugging Face BERT models to hdf5 format.
-```shell
-python export/huggingface/hf_bert_export.py
-```
-3. Hugging Face GPT2
-
-Export Hugging Face GPT2 models to hdf5 format.
-```shell
-python export/huggingface/hf_gpt2_export.py
-```
-4. Hugging Face ViT
-
-Export Hugging Face ViT models to hdf5 format.
-```shell
-python export/huggingface/hf_vit_export.py
-```
-### Native Fairseq
-1. Native Fairseq Transformer
-
-Export native Fairseq Transformer models to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/native_fs_transformer_export.py -m checkpoint_best.pt
-```
-
-2. Native Fairseq Transformer using PTQ
-
-Export native Fairseq Transformer models using PTQ to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/native_fs_transformer_export.py -m checkpoint_best.pt
-```
-
-3. Native Fairseq MoE Transformer
-
-Export Fairseq MoE models to protobuf/hdf5 format.
-```shell
-python export/fairseq/fs_moe_export.py
-```
-
-### Fairseq Transformer + LightSeq
-1. Fairseq Transformer using LightSeq training library
-
-Export Fairseq Transformer models training with LightSeq to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/ls_fs_transformer_export.py -m checkpoint_best.pt
-```
-
-2. Fairseq Transformer using LightSeq training library with PTQ
-
-Export Fairseq Transformer models training with LightSeq to protobuf format, and then using PTQ to speedup inference. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/ls_fs_transformer_ptq_export.py -m checkpoint_best.pt
-```
-
-### LightSeq Transformer
-
-1. LightSeq Transformer
-
-Export LightSeq Transformer models to protobuf/hdf5 format. Refer to the `examples/training/custom` directory for more training details.
-```shell
-python export/ls_transformer_export.py
-```
-2. LightSeq Transformer using PTQ
-
-Export LightSeq fp16/fp32 Transformer models to int8 protobuf format, and then using PTQ to speedup inference. Refer to the `examples/training/custom` directory for more training details. Note that in this example, we do not need to finetune the models using fake-quantization.
-```shell
-python export/ls_transformer_ptq_export.py
-```
-
-### Fairseq Transformer + custom Torch layers
-1. Fairseq Transformer using custom Torch layers
-
-Export Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/ls_torch_fs_transformer_export.py -m checkpoint_best.pt
-```
-
-2. Fairseq Transformer using custom Torch layers and PTQ
-
-Export PTQ Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/ls_torch_fs_transformer_ptq_export.py -m checkpoint_best.pt
-```
-
-3. Quantized Fairseq Transformer using custom Torch layers
-
-Export quantized Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format. Refer to the `examples/training/fairseq` directory for more training details.
-```shell
-python export/fairseq/ls_torch_fs_quant_transformer_export.py -m checkpoint_best.pt
-```
-
-## Inference using LightSeq
+# Model export and LightSeq inference
+This directory contains examples of exporting models (LightSeq, Fairseq-based, Hugging Face, etc.) to protobuf/hdf5 format and then running fast inference with LightSeq. For each model, we provide normal float model export, quantized model export (QAT, quantization aware training) and PTQ (post training quantization) model export.
+
+Before doing anything, you need to switch to the current directory:
+```shell
+$ cd examples/inference/python
+```
+
+## Model export
+We provide the following export examples. All Fairseq-based models are trained using the scripts in [examples/training/fairseq](../../../examples/training/fairseq). The first two LightSeq Transformer models are trained using the scripts in [examples/training/custom](../../../examples/training/custom).
+
+| Model | Type | Command | Resource | Description |
+|-------|------|---------|----------|-------------|
+| LightSeq Transformer | Float | python export/ls_transformer_export.py -m ckpt_ls_custom.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/ckpt_ls_custom.pt) | Export LightSeq Transformer models to protobuf format. |
+| LightSeq Transformer + PTQ | Int8 | python export/ls_transformer_ptq_export.py -m ckpt_ls_custom.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/ckpt_ls_custom.pt) | Export LightSeq Transformer models to int8 protobuf format using post training quantization. |
+| Hugging Face BART | Float | python export/huggingface/hf_bart_export.py | / | Export Hugging Face BART models to protobuf/hdf5 format. |
+| Hugging Face BERT | Float | python export/huggingface/hf_bert_export.py | / | Export Hugging Face BERT models to hdf5 format. |
+| Hugging Face + custom Torch layer BERT + QAT | Int8 | python export/huggingface/ls_torch_hf_quant_bert_export.py -m ckpt_ls_torch_hf_quant_bert_ner.bin | / | Export Hugging Face BERT models trained with custom Torch layers to hdf5 format. |
+| Hugging Face GPT2 | Float | python export/huggingface/hf_gpt2_export.py | / | Export Hugging Face GPT2 models to hdf5 format. |
+| Hugging Face + custom Torch layer GPT2 + QAT | Int8 | python export/huggingface/ls_torch_hf_quant_gpt2_export.py -m ckpt_ls_torch_hf_quant_gpt2_ner.bin | / | Export Hugging Face GPT2 models trained with custom Torch layers to hdf5 format. |
+| Hugging Face ViT | Float | python export/huggingface/hf_vit_export.py | / | Export Hugging Face ViT models to hdf5 format. |
+| Native Fairseq Transformer | Float | python export/fairseq/native_fs_transformer_export.py -m ckpt_native_fairseq_31.06.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_native_fairseq_31.06.pt) | Export native Fairseq Transformer models to protobuf/hdf5 format. |
+| Native Fairseq Transformer + PTQ | Int8 | python export/fairseq/native_fs_transformer_ptq_export.py -m ckpt_native_fairseq_31.06.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_native_fairseq_31.06.pt) | Export native Fairseq Transformer models to int8 protobuf format using post training quantization. |
+| Fairseq + LightSeq Transformer | Float | python export/fairseq/ls_fs_transformer_export.py -m ckpt_ls_fairseq_31.17.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_ls_fairseq_31.17.pt) | Export Fairseq Transformer models trained with LightSeq modules to protobuf/hdf5 format. |
+| Fairseq + LightSeq Transformer + PTQ | Int8 | python export/fairseq/ls_fs_transformer_ptq_export.py -m ckpt_ls_fairseq_31.17.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_ls_fairseq_31.17.pt) | Export Fairseq Transformer models trained with LightSeq modules to int8 protobuf format using post training quantization. |
+| Fairseq + custom Torch layer | Float | python export/fairseq/ls_torch_fs_transformer_export.py -m ckpt_ls_torch_fairseq_31.16.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_ls_torch_fairseq_31.16.pt) | Export Fairseq Transformer models trained with custom Torch layers and other LightSeq modules to protobuf format. |
+| Fairseq + custom Torch layer + PTQ | Int8 | python export/fairseq/ls_torch_fs_transformer_ptq_export.py -m ckpt_ls_torch_fairseq_31.16.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_ls_torch_fairseq_31.16.pt) | Export Fairseq Transformer models trained with custom Torch layers and other LightSeq modules to int8 protobuf format using post training quantization. |
+| Fairseq + custom Torch layer + QAT | Int8 | python export/fairseq/ls_torch_fs_quant_transformer_export.py -m ckpt_ls_torch_fairseq_quant_31.09.pt | [link](http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/example_model/fairseq/ckpt_ls_torch_fairseq_quant_31.09.pt) | Export quantized Fairseq Transformer models trained with custom Torch layers and other LightSeq modules to int8 protobuf format. |
+| Native Fairseq MoE Transformer | Float | python export/fairseq/native_fs_moe_transformer_export.py | / | Export Fairseq MoE Transformer models to protobuf/hdf5 format. |
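+
+After exporting, the resulting protobuf/hdf5 file can be loaded directly with `lightseq.inference`. A minimal sketch (the model path is a placeholder for whatever file the export script writes; int8 models use `QuantTransformer` instead):
+```python
+import lightseq.inference as lsi
+
+# float model exported to protobuf; 8 is the max batch size
+model = lsi.Transformer("checkpoint_best.pb", 8)
+# quant_model = lsi.QuantTransformer("checkpoint_best_ptq.pb", 8)  # int8 model
+src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
+print(model.infer(src))
+```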
+
+## LightSeq inference
+### Hugging Face models
1. BART
```shell
-python test/ls_bart.py
+$ python test/ls_bart.py
```
2. BERT
```shell
-python test/ls_bert.py
+$ python test/ls_bert.py
```
3. GPT2
```shell
-python test/ls_gpt2.py
+$ python test/ls_gpt2.py
```
4. ViT
```shell
-python test/ls_vit.py
+$ python test/ls_vit.py
+```
+5. Quantized BERT
+```shell
+$ python test/ls_quant_bert.py
+```
+6. Quantized GPT2
+```shell
+$ python test/ls_quant_gpt.py
```
-5. Fairseq based models using LightSeq inference
+### Fairseq-based models
+After exporting the Fairseq-based models to protobuf/hdf5 format using the above scripts, we can run fast LightSeq inference on the WMT14 En2De dataset with the following script, which is compatible with both fp16 and int8 models:
```shell
-bash test/ls_fairseq.sh --model ${model_path}
+$ bash test/ls_fairseq.sh --model ${model_path}
```
diff --git a/examples/inference/python/export/fairseq/ls_fs_transformer_export.py b/examples/inference/python/export/fairseq/ls_fs_transformer_export.py
index 5993f79a..1b86e7d8 100644
--- a/examples/inference/python/export/fairseq/ls_fs_transformer_export.py
+++ b/examples/inference/python/export/fairseq/ls_fs_transformer_export.py
@@ -1,10 +1,8 @@
"""
-Export Fairseq Transformer models training with LightSeq to protobuf/hdf5 format.
+Export Fairseq Transformer models trained with LightSeq modules to protobuf/hdf5 format.
Refer to the `examples/training/fairseq` directory for more training details.
"""
-import argparse
import torch
-import h5py
from export.proto.transformer_pb2 import Transformer
from lightseq.training import (
export_ls_config,
@@ -13,6 +11,7 @@
export_ls_decoder,
)
import lightseq.inference as lsi
+from export.util import parse_args, save_model
def _extract_weight(state_dict):
@@ -26,7 +25,7 @@ def _extract_weight(state_dict):
return encoder_state_dict, decoder_state_dict
-def export_fs_weights(file, state_dict, save_pb=True):
+def export_fs_weights(transformer, state_dict):
enc_norm_w = state_dict["encoder.layer_norm.weight"].flatten().tolist()
enc_norm_b = state_dict["encoder.layer_norm.bias"].flatten().tolist()
dec_norm_w = state_dict["decoder.layer_norm.weight"].flatten().tolist()
@@ -36,78 +35,52 @@ def export_fs_weights(file, state_dict, save_pb=True):
.flatten()
.tolist()
)
- if save_pb:
- file.src_embedding.norm_scale[:] = enc_norm_w
- file.src_embedding.norm_bias[:] = enc_norm_b
- file.trg_embedding.norm_scale[:] = dec_norm_w
- file.trg_embedding.norm_bias[:] = dec_norm_b
- file.trg_embedding.shared_bias[:] = dec_shared_b
- else:
- file.create_dataset("src_embedding/norm_scale", data=enc_norm_w, dtype="f4")
- file.create_dataset("src_embedding/norm_bias", data=enc_norm_b, dtype="f4")
- file.create_dataset("trg_embedding/norm_scale", data=dec_norm_w, dtype="f4")
- file.create_dataset("trg_embedding/norm_bias", data=dec_norm_b, dtype="f4")
- file.create_dataset("trg_embedding/shared_bias", data=dec_shared_b, dtype="f4")
+ transformer.src_embedding.norm_scale[:] = enc_norm_w
+ transformer.src_embedding.norm_bias[:] = enc_norm_b
+ transformer.trg_embedding.norm_scale[:] = dec_norm_w
+ transformer.trg_embedding.norm_bias[:] = dec_norm_b
+ transformer.trg_embedding.shared_bias[:] = dec_shared_b
-def export_ls_fs_transformer(ckpt_path, out_path, save_pb=True):
- with open(ckpt_path, "rb") as fin:
+def export_ls_fs_transformer(model_path, pb_path, hdf5_path, hdf5):
+ with open(model_path, "rb") as fin:
ckpt_file = torch.load(fin)
args = ckpt_file["args"]
state_dict = ckpt_file["model"]
- if save_pb:
- file = Transformer()
- else:
- file = h5py.File(out_path, "w")
+ transformer = Transformer()
encoder_state_dict, decoder_state_dict = _extract_weight(state_dict)
- export_ls_embedding(file, encoder_state_dict, 300, True, save_pb)
- export_ls_embedding(file, decoder_state_dict, 300, False, save_pb)
+ export_ls_embedding(transformer, encoder_state_dict, 300, True, save_pb=True)
+ export_ls_embedding(transformer, decoder_state_dict, 300, False, save_pb=True)
export_ls_encoder(
- file,
+ transformer,
encoder_state_dict,
args.encoder_embed_dim,
args.encoder_ffn_embed_dim,
- save_pb,
+ save_pb=True,
)
export_ls_decoder(
- file,
+ transformer,
decoder_state_dict,
args.decoder_embed_dim,
args.decoder_ffn_embed_dim,
args.decoder_layers,
- save_pb,
+ save_pb=True,
)
- export_fs_weights(file, state_dict, save_pb)
+ export_fs_weights(transformer, state_dict)
export_ls_config(
- file,
+ transformer,
args.encoder_attention_heads,
1,
2,
2,
args.encoder_layers,
args.decoder_layers,
- save_pb=save_pb,
+ save_pb=True,
)
- if save_pb:
- with open(out_path, "wb") as fout:
- fout.write(file.SerializeToString())
- else:
- file.close()
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
@@ -115,15 +88,9 @@ def parse_args():
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}.pb"
hdf5_path = f"{model_name}.hdf5"
- print("export to pb model >>>>>>")
- export_ls_fs_transformer(args.model, pb_path)
- print("export to hdf5 model >>>>>>")
- export_ls_fs_transformer(args.model, hdf5_path, save_pb=False)
+ path = export_ls_fs_transformer(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.Transformer(pb_path, 8)
- pb_output = pb_model.infer(src)
- hdf5_model = lsi.Transformer(hdf5_path, 8)
- hdf5_output = hdf5_model.infer(src)
+ model = lsi.Transformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
- print("hdf5 results:", hdf5_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/ls_fs_transformer_ptq_export.py b/examples/inference/python/export/fairseq/ls_fs_transformer_ptq_export.py
index 2aeeba23..ae093990 100644
--- a/examples/inference/python/export/fairseq/ls_fs_transformer_ptq_export.py
+++ b/examples/inference/python/export/fairseq/ls_fs_transformer_ptq_export.py
@@ -1,11 +1,9 @@
"""
-Export Fairseq Transformer models training with LightSeq to protobuf format,
-and then using int8 quantization to speedup inference.
+Export Fairseq Transformer models trained with LightSeq modules
+to int8 protobuf format using post training quantization.
Refer to the `examples/training/fairseq` directory for more training details.
"""
-import argparse
import torch
-import h5py
from export.proto.quant_transformer_pb2 import QuantTransformer
from lightseq.training import (
export_ls_config,
@@ -14,6 +12,7 @@
export_ls_decoder_ptq,
)
import lightseq.inference as lsi
+from export.util import parse_args, save_model
# adjust this value to achieve better performance
@@ -31,7 +30,7 @@ def _extract_weight(state_dict):
return encoder_state_dict, decoder_state_dict
-def export_fs_weights(file, state_dict, save_pb=True):
+def export_fs_weights(transformer, state_dict):
enc_norm_w = state_dict["encoder.layer_norm.weight"].flatten().tolist()
enc_norm_b = state_dict["encoder.layer_norm.bias"].flatten().tolist()
dec_norm_w = state_dict["decoder.layer_norm.weight"].flatten().tolist()
@@ -41,89 +40,76 @@ def export_fs_weights(file, state_dict, save_pb=True):
.flatten()
.tolist()
)
- file.src_embedding.norm_scale[:] = enc_norm_w
- file.src_embedding.norm_bias[:] = enc_norm_b
- file.trg_embedding.norm_scale[:] = dec_norm_w
- file.trg_embedding.norm_bias[:] = dec_norm_b
- file.trg_embedding.shared_bias[:] = dec_shared_b
+ transformer.src_embedding.norm_scale[:] = enc_norm_w
+ transformer.src_embedding.norm_bias[:] = enc_norm_b
+ transformer.trg_embedding.norm_scale[:] = dec_norm_w
+ transformer.trg_embedding.norm_bias[:] = dec_norm_b
+ transformer.trg_embedding.shared_bias[:] = dec_shared_b
-def export_ls_fs_transformer_ptq(ckpt_path, out_path, save_pb=True):
- with open(ckpt_path, "rb") as fin:
+def export_ls_fs_transformer_ptq(model_path, pb_path, hdf5_path, hdf5):
+ with open(model_path, "rb") as fin:
ckpt_file = torch.load(fin)
args = ckpt_file["args"]
state_dict = ckpt_file["model"]
- file = QuantTransformer()
+ transformer = QuantTransformer()
encoder_state_dict, decoder_state_dict = _extract_weight(state_dict)
export_ls_embedding_ptq(
- file,
+ transformer,
encoder_state_dict,
300,
True,
- save_pb=save_pb,
+ save_pb=True,
)
export_ls_embedding_ptq(
- file,
+ transformer,
decoder_state_dict,
300,
False,
- save_pb=save_pb,
+ save_pb=True,
)
export_ls_encoder_ptq(
- file,
+ transformer,
encoder_state_dict,
args.encoder_embed_dim,
args.encoder_ffn_embed_dim,
act_clip_max=global_act_clip_max,
- save_pb=save_pb,
+ save_pb=True,
)
export_ls_decoder_ptq(
- file,
+ transformer,
decoder_state_dict,
args.decoder_embed_dim,
args.decoder_ffn_embed_dim,
args.decoder_layers,
act_clip_max=global_act_clip_max,
- save_pb=save_pb,
+ save_pb=True,
)
- export_fs_weights(file, state_dict, save_pb)
+ export_fs_weights(transformer, state_dict)
export_ls_config(
- file,
+ transformer,
args.encoder_attention_heads,
1,
2,
2,
args.encoder_layers,
args.decoder_layers,
- save_pb=save_pb,
+ save_pb=True,
)
- with open(out_path, "wb") as fout:
- fout.write(file.SerializeToString())
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
args = parse_args()
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}_ptq.pb"
- print("export to pb model >>>>>>")
- export_ls_fs_transformer_ptq(args.model, pb_path)
+ hdf5_path = f"{model_name}_ptq.hdf5"
+ path = export_ls_fs_transformer_ptq(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.QuantTransformer(pb_path, 8)
- pb_output = pb_model.infer(src)
- # FP16 result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ model = lsi.QuantTransformer(path, 8)
+ output = model.infer(src)
+ # Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/ls_torch_fs_quant_transformer_export.py b/examples/inference/python/export/fairseq/ls_torch_fs_quant_transformer_export.py
index 6a05cecb..f10abfcb 100644
--- a/examples/inference/python/export/fairseq/ls_torch_fs_quant_transformer_export.py
+++ b/examples/inference/python/export/fairseq/ls_torch_fs_quant_transformer_export.py
@@ -1,20 +1,20 @@
"""
-Export quantized Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format.
+Export quantized Fairseq Transformer models trained with custom Torch layers
+and other LightSeq modules to int8 protobuf format.
Refer to the `examples/training/fairseq` directory for more training details.
"""
from collections import OrderedDict
-import argparse
import torch
-import tensorflow as tf
from export.proto.quant_transformer_pb2 import QuantTransformer
from lightseq.training.ops.pytorch.export import export_ls_config, apply_rule
-from lightseq.training.ops.pytorch.export_ptq import (
+from lightseq.training.ops.pytorch.export_quant import (
gather_quant_token_embedding,
quantize,
)
from lightseq.training.ops.pytorch.util import get_pos_embedding
import lightseq.inference as lsi
+from export.util import parse_args, save_model
enc_layer_mapping_dict = OrderedDict(
@@ -147,8 +147,10 @@ def fill_quant_pb_layer(tensor_names, state_dict, layer, mapping_dict):
def export_ls_torch_fs_quant_transformer(
- model_dir,
+ model_path,
pb_path,
+ hdf5_path,
+ hdf5,
max_step=300,
bos_id=2,
eos_id=2,
@@ -156,7 +158,7 @@ def export_ls_torch_fs_quant_transformer(
):
transformer = QuantTransformer()
# load var names
- reloaded = torch.load(model_dir, "cpu")
+ reloaded = torch.load(model_path, "cpu")
args = reloaded["args"]
model_dict = reloaded["model"]
@@ -304,31 +306,20 @@ def export_ls_torch_fs_quant_transformer(
save_pb=True,
)
- print("Writing to {0}".format(pb_path))
- with tf.io.gfile.GFile(pb_path, "wb") as fout:
- fout.write(transformer.SerializeToString())
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
args = parse_args()
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}.pb"
- export_ls_torch_fs_quant_transformer(args.model, pb_path)
+ hdf5_path = f"{model_name}.hdf5"
+ path = export_ls_torch_fs_quant_transformer(
+ args.model, pb_path, hdf5_path, args.hdf5
+ )
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.QuantTransformer(pb_path, 8)
- pb_output = pb_model.infer(src)
+ model = lsi.QuantTransformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/ls_torch_fs_transformer_export.py b/examples/inference/python/export/fairseq/ls_torch_fs_transformer_export.py
index 4f9d8267..ea223d53 100644
--- a/examples/inference/python/export/fairseq/ls_torch_fs_transformer_export.py
+++ b/examples/inference/python/export/fairseq/ls_torch_fs_transformer_export.py
@@ -1,12 +1,11 @@
"""
-Export Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format.
+Export Fairseq Transformer models trained with custom Torch layers
+and other LightSeq modules to protobuf format.
Refer to the `examples/training/fairseq` directory for more training details.
"""
from collections import OrderedDict
-import argparse
import torch
-import tensorflow as tf
from export.proto.transformer_pb2 import Transformer
from lightseq.training.ops.pytorch.export import (
gather_token_embedding,
@@ -15,6 +14,7 @@
)
from lightseq.training.ops.pytorch.util import get_pos_embedding
import lightseq.inference as lsi
+from export.util import parse_args, save_model
enc_layer_mapping_dict = OrderedDict(
@@ -91,8 +91,10 @@ def _get_encode_output_mapping_dict(dec_layer_num):
def export_ls_torch_fs_transformer(
- model_dir,
+ model_path,
pb_path,
+ hdf5_path,
+ hdf5,
max_step=300,
bos_id=2,
eos_id=2,
@@ -100,7 +102,7 @@ def export_ls_torch_fs_transformer(
):
transformer = Transformer()
# load var names
- reloaded = torch.load(model_dir, "cpu")
+ reloaded = torch.load(model_path, "cpu")
args = reloaded["args"]
model_dict = reloaded["model"]
@@ -229,31 +231,18 @@ def export_ls_torch_fs_transformer(
save_pb=True,
)
- print("Writing to {0}".format(pb_path))
- with tf.io.gfile.GFile(pb_path, "wb") as fout:
- fout.write(transformer.SerializeToString())
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
args = parse_args()
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}.pb"
- export_ls_torch_fs_transformer(args.model, pb_path)
+ hdf5_path = f"{model_name}.hdf5"
+ path = export_ls_torch_fs_transformer(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.Transformer(pb_path, 8)
- pb_output = pb_model.infer(src)
+ model = lsi.Transformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/ls_torch_fs_transformer_ptq_export.py b/examples/inference/python/export/fairseq/ls_torch_fs_transformer_ptq_export.py
index c6498893..2ab259e9 100644
--- a/examples/inference/python/export/fairseq/ls_torch_fs_transformer_ptq_export.py
+++ b/examples/inference/python/export/fairseq/ls_torch_fs_transformer_ptq_export.py
@@ -1,20 +1,20 @@
"""
-Export PTQ Fairseq Transformer models training using custom Torch layers to protobuf/hdf5 format.
+Export Fairseq Transformer models trained with custom Torch layers
+and other LightSeq modules to int8 protobuf format using post training quantization.
Refer to the `examples/training/fairseq` directory for more training details.
"""
from collections import OrderedDict
-import argparse
import torch
-import tensorflow as tf
from export.proto.quant_transformer_pb2 import QuantTransformer
from lightseq.training.ops.pytorch.export import export_ls_config
-from lightseq.training.ops.pytorch.export_ptq import (
+from lightseq.training.ops.pytorch.export_quant import (
gather_quant_token_embedding,
fill_quant_pb_layer,
)
from lightseq.training.ops.pytorch.util import get_pos_embedding
import lightseq.inference as lsi
+from export.util import parse_args, save_model
# adjust this value to achieve better performance
@@ -117,8 +117,10 @@ def _get_encode_output_mapping_dict(dec_layer_num):
def export_ls_torch_fs_transformer_ptq(
- model_dir,
+ model_path,
pb_path,
+ hdf5_path,
+ hdf5,
max_step=300,
bos_id=2,
eos_id=2,
@@ -126,7 +128,7 @@ def export_ls_torch_fs_transformer_ptq(
):
transformer = QuantTransformer()
# load var names
- reloaded = torch.load(model_dir, "cpu")
+ reloaded = torch.load(model_path, "cpu")
args = reloaded["args"]
model_dict = reloaded["model"]
@@ -266,31 +268,18 @@ def export_ls_torch_fs_transformer_ptq(
save_pb=True,
)
- print("Writing to {0}".format(pb_path))
- with tf.io.gfile.GFile(pb_path, "wb") as fout:
- fout.write(transformer.SerializeToString())
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
args = parse_args()
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}_ptq.pb"
- export_ls_torch_fs_transformer_ptq(args.model, pb_path)
+ hdf5_path = f"{model_name}_ptq.hdf5"
+ path = export_ls_torch_fs_transformer_ptq(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.QuantTransformer(pb_path, 8)
- pb_output = pb_model.infer(src)
+ model = lsi.QuantTransformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/native_fs_transformer_export.py b/examples/inference/python/export/fairseq/native_fs_transformer_export.py
index 0b77fd19..49e8aab8 100644
--- a/examples/inference/python/export/fairseq/native_fs_transformer_export.py
+++ b/examples/inference/python/export/fairseq/native_fs_transformer_export.py
@@ -3,20 +3,17 @@
Refer to the `examples/training/fairseq` directory for more training details.
"""
from collections import OrderedDict
-import argparse
import torch
-import tensorflow as tf
-import h5py
from export.proto.transformer_pb2 import Transformer
from lightseq.training.ops.pytorch.export import (
gather_token_embedding,
fill_pb_layer,
export_ls_config,
- export_pb2hdf5,
)
from lightseq.training.ops.pytorch.util import get_pos_embedding
import lightseq.inference as lsi
+from export.util import parse_args, save_model
enc_layer_mapping_dict = OrderedDict(
@@ -93,9 +90,10 @@ def _get_encode_output_mapping_dict(dec_layer_num):
def export_native_fs_transformer(
- model_dir,
+ model_path,
pb_path,
hdf5_path,
+ hdf5,
max_step=300,
bos_id=2,
eos_id=2,
@@ -103,7 +101,7 @@ def export_native_fs_transformer(
):
transformer = Transformer()
# load var names
- reloaded = torch.load(model_dir, "cpu")
+ reloaded = torch.load(model_path, "cpu")
args = reloaded["args"]
model_dict = reloaded["model"]
@@ -234,27 +232,8 @@ def export_native_fs_transformer(
save_pb=True,
)
- print("Writing to {0}".format(pb_path))
- with tf.io.gfile.GFile(pb_path, "wb") as fout:
- fout.write(transformer.SerializeToString())
-
- print("Writing to {0}".format(hdf5_path))
- f = h5py.File(hdf5_path, "w")
- export_pb2hdf5(transformer, f)
- f.close()
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
@@ -262,9 +241,9 @@ def parse_args():
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}.pb"
hdf5_path = f"{model_name}.hdf5"
- export_native_fs_transformer(args.model, pb_path, hdf5_path)
+ path = export_native_fs_transformer(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.Transformer(pb_path, 8)
- pb_output = pb_model.infer(src)
+ model = lsi.Transformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/fairseq/native_fs_transformer_ptq_export.py b/examples/inference/python/export/fairseq/native_fs_transformer_ptq_export.py
index 446605f9..af704b6f 100644
--- a/examples/inference/python/export/fairseq/native_fs_transformer_ptq_export.py
+++ b/examples/inference/python/export/fairseq/native_fs_transformer_ptq_export.py
@@ -1,20 +1,19 @@
"""
-Export PTQ native Fairseq Transformer models to protobuf/hdf5 format.
+Export native Fairseq Transformer models to int8 protobuf format using post training quantization.
Refer to the `examples/training/fairseq` directory for more training details.
"""
from collections import OrderedDict
-import argparse
import torch
-import tensorflow as tf
from export.proto.quant_transformer_pb2 import QuantTransformer
from lightseq.training.ops.pytorch.export import export_ls_config
-from lightseq.training.ops.pytorch.export_ptq import (
+from lightseq.training.ops.pytorch.export_quant import (
gather_quant_token_embedding,
fill_quant_pb_layer,
)
from lightseq.training.ops.pytorch.util import get_pos_embedding
import lightseq.inference as lsi
+from export.util import parse_args, save_model
# adjust this value to achieve better performance
@@ -118,8 +117,10 @@ def _get_encode_output_mapping_dict(dec_layer_num):
def export_native_fs_transformer(
- model_dir,
+ model_path,
pb_path,
+ hdf5_path,
+ hdf5,
max_step=300,
bos_id=2,
eos_id=2,
@@ -127,7 +128,7 @@ def export_native_fs_transformer(
):
transformer = QuantTransformer()
# load var names
- reloaded = torch.load(model_dir, "cpu")
+ reloaded = torch.load(model_path, "cpu")
args = reloaded["args"]
model_dict = reloaded["model"]
@@ -267,31 +268,19 @@ def export_native_fs_transformer(
save_pb=True,
)
- print("Writing to {0}".format(pb_path))
- with tf.io.gfile.GFile(pb_path, "wb") as fout:
- fout.write(transformer.SerializeToString())
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
- parser.add_argument(
- "--model",
- "-m",
- type=str,
- default="checkpoint_best.pt",
- help="path of fairseq checkpoint",
- )
- args = parser.parse_args()
- return args
+ save_path = save_model(transformer, pb_path, hdf5_path, hdf5)
+ return save_path
if __name__ == "__main__":
args = parse_args()
model_name = ".".join(args.model.split(".")[:-1])
pb_path = f"{model_name}_ptq.pb"
- export_native_fs_transformer(args.model, pb_path)
+ hdf5_path = f"{model_name}_ptq.hdf5"
+ path = export_native_fs_transformer(args.model, pb_path, hdf5_path, args.hdf5)
src = [[63, 47, 65, 1507, 88, 74, 10, 2057, 362, 9, 284, 6, 2, 1, 1, 1]]
- pb_model = lsi.QuantTransformer(pb_path, 8)
- pb_output = pb_model.infer(src)
+ model = lsi.QuantTransformer(path, 8)
+ output = model.infer(src)
# Expected result: [23, 550, 34, 118, 148, 2939, 4, 42, 32, 37, 6, 224, 10, 179, 5, 2]
- print("pb results:", pb_output)
+ print("results:", output)
diff --git a/examples/inference/python/export/huggingface/hf_bart_export.py b/examples/inference/python/export/huggingface/hf_bart_export.py
index 82a0effd..d4f6e519 100644
--- a/examples/inference/python/export/huggingface/hf_bart_export.py
+++ b/examples/inference/python/export/huggingface/hf_bart_export.py
@@ -11,6 +11,7 @@
from lightseq.training.ops.pytorch.export import gather_token_embedding, fill_pb_layer
from export.proto.transformer_pb2 import Transformer
from transformers import BartForConditionalGeneration
+from export.util import parse_args
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
@@ -512,14 +513,15 @@ def _print_pair(key, value):
if __name__ == "__main__":
+ args = parse_args()
+ if args.generation_method not in ["beam_search", "topk", "topp", "topk_greedy"]:
+ args.generation_method = "beam_search"
# if save_proto is True, extension .pb will be added, otherwise .hdf5 is added
output_lightseq_model_name = "lightseq_bart_base" # you can rename it to "lightseq_bart_large" for large model
input_huggingface_bart_model = (
"facebook/bart-base" # Example: you can try "facebook/bart-large" as well
)
head_number = 12 # change this to 16 for "bart-large" model
- # in order to get score, we should use `beam_search` inference method
- generation_method = "beam_search"
beam_size = 4
max_step = 50 # max step for generation, it decides GPU memory occupancy
# maximum_generation_length = min(src_length + extra_decode_length, max_step)
@@ -529,7 +531,7 @@ def _print_pair(key, value):
output_lightseq_model_name,
input_huggingface_bart_model,
head_num=head_number, # layer number
- generation_method=generation_method,
+ generation_method=args.generation_method,
beam_size=beam_size,
max_step=max_step,
extra_decode_length=extra_decode_length,
diff --git a/examples/inference/python/export/huggingface/hf_gpt2_export.py b/examples/inference/python/export/huggingface/hf_gpt2_export.py
index 89a12b42..aa559a10 100644
--- a/examples/inference/python/export/huggingface/hf_gpt2_export.py
+++ b/examples/inference/python/export/huggingface/hf_gpt2_export.py
@@ -7,6 +7,7 @@
from collections import OrderedDict
from transformers import GPT2LMHeadModel
from lightseq.training.ops.pytorch.export import fill_hdf5_layer
+from export.util import parse_args
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
@@ -146,11 +147,12 @@ def _print_pair(key, value):
if __name__ == "__main__":
+ args = parse_args()
+ if args.generation_method not in ["topk", "topp", "ppl"]:
+ args.generation_method = "topk"
output_lightseq_model_name = "lightseq_gpt2_base" # or "lightseq_gpt2_large"
input_huggingface_gpt_model = "gpt2" # or "gpt2-large"
head_number = 12 # 20 for "gpt2-large"
- # generation_method should be "topk" or "topp"
- generation_method = "topk"
topk = 1
topp = 0.75
# default eos_id from https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel
@@ -161,7 +163,7 @@ def _print_pair(key, value):
output_lightseq_model_name,
input_huggingface_gpt_model,
head_num=head_number, # layer number
- generation_method=generation_method,
+ generation_method=args.generation_method,
topk=topk,
topp=topp,
eos_id=eos_id,
diff --git a/examples/inference/python/export/huggingface/ls_torch_hf_quant_bert_export.py b/examples/inference/python/export/huggingface/ls_torch_hf_quant_bert_export.py
new file mode 100644
index 00000000..72f18e90
--- /dev/null
+++ b/examples/inference/python/export/huggingface/ls_torch_hf_quant_bert_export.py
@@ -0,0 +1,208 @@
+"""
+Export Hugging Face quantized BERT models to hdf5 format.
+"""
+import os
+import h5py
+from collections import OrderedDict
+
+import torch
+from lightseq.training.ops.pytorch.export import apply_rule
+from lightseq.training.ops.pytorch.export_quant import quantize
+from export.util import parse_args
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+
+"""
+For the mapping dictionary: key is the value of the proto parameter,
+value is a powerful expression, each && split tensor name of the matching path or expression.
+
+The sub-pattern of the path is separated by spaces, and the expression starts with a expression_.
+You can operate separately on each tensor and support multiple expressions. Multiple matching paths
+and the expression will finally be concatenated on axis = -1.
+"""
+enc_layer_mapping_dict = OrderedDict(
+ {
+ # BERT is post_layernorm
+ "multihead_norm_scale": "self_attn_layer_norm weight",
+ "multihead_norm_bias": "self_attn_layer_norm bias",
+ "multihead_project_kernel_qkv": "self_attn qkv_proj weight&&expression_.transpose(0, 1)",
+ "multihead_project_bias_qkv": "self_attn qkv_proj bias",
+ "multihead_project_kernel_output": "self_attn out_proj weight&&expression_.transpose(0, 1)",
+ "multihead_project_bias_output": "self_attn out_proj bias",
+ "ffn_norm_scale": "final_layer_norm weight",
+ "ffn_norm_bias": "final_layer_norm bias",
+ "ffn_first_kernel": "fc1 weight&&expression_.transpose(0, 1)",
+ "ffn_first_bias": "fc1 bias",
+ "ffn_second_kernel": "fc2 weight&&expression_.transpose(0, 1)",
+ "ffn_second_bias": "fc2 bias",
+ # weight_clip_max
+ "multihead_project_kernel_qkv_clip_max": "self_attn qkv_proj weight_quant clip_value_max",
+ "multihead_project_kernel_output_clip_max": "self_attn out_proj weight_quant clip_value_max",
+ "ffn_first_kernel_clip_max": "fc1 weight_quant clip_value_max",
+ "ffn_second_kernel_clip_max": "fc2 weight_quant clip_value_max",
+ # act_clip_max
+ "multihead_ln_clip_max": "self_attn qkv_proj input_quant clip_value_max",
+ "multihead_project_output_clip_max": "self_attn out_proj input_quant clip_value_max",
+ "ffn_ln_clip_max": "fc1 input_quant clip_value_max",
+ "ffn_first_act_clip_max": "fc2 input_quant clip_value_max",
+ "multihead_qkv_dense_clip_max": "self_attn qkv_proj output_quant clip_value_max",
+ "multihead_output_dense_clip_max": "self_attn out_proj output_quant clip_value_max",
+ "ffn_first_output_clip_max": "fc1 output_quant clip_value_max",
+ }
+)
+
+src_emb_mapping_dict = OrderedDict(
+ {
+ "norm_scale": "embeddings LayerNorm weight",
+ "norm_bias": "embeddings LayerNorm bias",
+ "position_embedding": "embeddings position_embeddings weight",
+ }
+)
+
+
+def fill_quant_hdf5_layer(
+ tensor_names, state_dict, hdf5_file, hdf5_dataset_prefix, mapping_dict
+):
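+    # Entries whose proto name ends with "_clip_max" are scalar quantization clip ranges
+    # and are stored as single floats; all other entries are stored as full tensors.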
+ for proto_name, ckpt_rule in mapping_dict.items():
+ target_tensor = apply_rule(proto_name, ckpt_rule, tensor_names, state_dict)
+ if proto_name.endswith("_clip_max"):
+ hdf5_file.create_dataset(
+ hdf5_dataset_prefix + proto_name, data=float(target_tensor[0])
+ )
+ else:
+ hdf5_file.create_dataset(
+ hdf5_dataset_prefix + proto_name,
+ data=target_tensor,
+ )
+
+
+def extract_bert_weights(
+ output_file,
+ model_dir,
+ head_num,
+ pad_id=0,
+ max_step=50,
+):
+ # load var names
+ state_dict = torch.load(model_dir, "cpu")
+
+ var_name_list = list(state_dict.keys())
+
+ for name in var_name_list:
+ if name.endswith("weight_quant.clip.clip_value_max"):
+ state_dict[name[:-26]] = torch.Tensor(
+ quantize(state_dict[name[:-26]].numpy(), 127, state_dict[name].numpy())
+ ).to(torch.uint8)
+
+ # initialize output file
+ print("Saving model to hdf5...")
+ print("Writing to {0}".format(output_file))
+ hdf5_file = h5py.File(output_file, "w")
+
+ # fill each encoder layer's params
+ enc_tensor_names = {}
+ for name in var_name_list:
+ name_split = name.split(".")
+ if len(name_split) <= 3 or not name_split[3].isdigit():
+ continue
+ layer_id = int(name_split[3])
+ enc_tensor_names.setdefault(layer_id, []).append(name)
+
+ # fill encoder_stack
+ for layer_id in sorted(enc_tensor_names.keys()):
+ fill_quant_hdf5_layer(
+ enc_tensor_names[layer_id],
+ state_dict,
+ hdf5_file,
+ f"encoder_stack/{layer_id}/",
+ enc_layer_mapping_dict,
+ )
+
+ # fill src_embedding - except for position embedding
+ fill_quant_hdf5_layer(
+ var_name_list,
+ state_dict,
+ hdf5_file,
+ "src_embedding/",
+ src_emb_mapping_dict,
+ )
+
+ # handling token_embeddings for BERT
+ token_embedding = (
+ state_dict["bert.embeddings.word_embeddings.weight"]
+ + state_dict["bert.embeddings.token_type_embeddings.weight"][0]
+ )
+ token_embedding = quantize(
+ token_embedding.numpy(),
+ 127,
+ state_dict["bert.embeddings.emb_quant.clip.clip_value_max"].numpy(),
+ )
+ print(f"processed token_embedding, shape: {token_embedding.shape}")
+ hdf5_file.create_dataset(
+ "src_embedding/token_embedding", data=token_embedding, dtype="uint8"
+ )
+ hdf5_file.create_dataset(
+ "src_embedding/emb_clip_max",
+ data=state_dict["bert.embeddings.emb_quant.clip.clip_value_max"],
+ )
+
+ # save number of layers metadata
+ hdf5_file.create_dataset(
+ "model_conf/n_encoder_stack", data=len(enc_tensor_names), dtype="i4"
+ )
+ # fill in model_conf
+ hdf5_file.create_dataset("model_conf/head_num", data=head_num, dtype="i4")
+ hdf5_file.create_dataset("model_conf/src_padding_id", data=pad_id, dtype="i4")
+ hdf5_file.create_dataset("model_conf/is_post_ln", data=True, dtype="?")
+ hdf5_file.create_dataset("model_conf/use_gelu", data=True, dtype="?")
+
+ # Move layernorm weights to match layernorm implementation in lightseq
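+    # (The LayerNorm parameters are rotated forward by one slot: the embedding LN moves into
+    # layer 0's attention-norm slot, each layer's attention LN moves into its FFN-norm slot,
+    # each FFN LN moves into the next layer's attention-norm slot, and the last FFN LN wraps
+    # around into the src_embedding norm slot.)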
+ tmp_scale, tmp_bias = (
+ hdf5_file["src_embedding/norm_scale"][()],
+ hdf5_file["src_embedding/norm_bias"][()],
+ )
+ for layer_id in sorted(enc_tensor_names.keys()):
+ new_tmp_scale = hdf5_file[f"encoder_stack/{layer_id}/multihead_norm_scale"][()]
+ new_tmp_bias = hdf5_file[f"encoder_stack/{layer_id}/multihead_norm_bias"][()]
+ hdf5_file[f"encoder_stack/{layer_id}/multihead_norm_scale"][()] = tmp_scale
+ hdf5_file[f"encoder_stack/{layer_id}/multihead_norm_bias"][()] = tmp_bias
+ tmp_scale, tmp_bias = new_tmp_scale, new_tmp_bias
+
+ new_tmp_scale = hdf5_file[f"encoder_stack/{layer_id}/ffn_norm_scale"][()]
+ new_tmp_bias = hdf5_file[f"encoder_stack/{layer_id}/ffn_norm_bias"][()]
+ hdf5_file[f"encoder_stack/{layer_id}/ffn_norm_scale"][()] = tmp_scale
+ hdf5_file[f"encoder_stack/{layer_id}/ffn_norm_bias"][()] = tmp_bias
+ tmp_scale, tmp_bias = new_tmp_scale, new_tmp_bias
+ hdf5_file["src_embedding/norm_scale"][()] = tmp_scale
+ hdf5_file["src_embedding/norm_bias"][()] = tmp_bias
+
+ hdf5_file.close()
+ # read-in again to double check
+ hdf5_file = h5py.File(output_file, "r")
+
+ def _print_pair(key, value):
+ if key == "sampling_method":
+ value = "".join(map(chr, value[()]))
+ else:
+ value = value[()]
+ print(f"{key}: {value}")
+
+ list(map(lambda x: _print_pair(*x), hdf5_file["model_conf"].items()))
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ model_name = ".".join(args.model.split(".")[:-1])
+ hdf5_path = f"{model_name}.hdf5"
+
+ head_number = 12
+ pad_id = 0
+ max_step = 50
+ extract_bert_weights(
+ hdf5_path,
+ args.model,
+ head_num=head_number,
+ pad_id=pad_id,
+ max_step=max_step,
+ )
diff --git a/examples/inference/python/export/huggingface/ls_torch_hf_quant_gpt2_export.py b/examples/inference/python/export/huggingface/ls_torch_hf_quant_gpt2_export.py
new file mode 100644
index 00000000..b42bb3c8
--- /dev/null
+++ b/examples/inference/python/export/huggingface/ls_torch_hf_quant_gpt2_export.py
@@ -0,0 +1,223 @@
+"""
+Export Hugging Face quantized GPT2 models to hdf5 format.
+"""
+import os
+import h5py
+from collections import OrderedDict
+
+import numpy as np
+import torch
+from lightseq.training.ops.pytorch.export import apply_rule
+from lightseq.training.ops.pytorch.export_quant import quantize
+from export.util import parse_args
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+
+"""
+For the mapping dictionary: key is the value of the proto parameter,
+value is a powerful expression, each && split tensor name of the matching path or expression.
+
+The sub-pattern of the path is separated by spaces, and the expression starts with a expression_.
+You can operate separately on each tensor and support multiple expressions. Multiple matching paths
+and the expression will finally be concatenated on axis = -1.
+"""
+enc_layer_mapping_dict = OrderedDict(
+ {
+ "multihead_norm_scale": "self_attn_layer_norm weight",
+ "multihead_norm_bias": "self_attn_layer_norm bias",
+ "multihead_project_kernel_qkv": "self_attn qkv_proj weight&&expression_.transpose(0, 1)",
+ "multihead_project_bias_qkv": "self_attn qkv_proj bias",
+ "multihead_project_kernel_output": "self_attn out_proj weight&&expression_.transpose(0, 1)",
+ "multihead_project_bias_output": "self_attn out_proj bias",
+ "ffn_norm_scale": "final_layer_norm weight",
+ "ffn_norm_bias": "final_layer_norm bias",
+ "ffn_first_kernel": "fc1 weight&&expression_.transpose(0, 1)",
+ "ffn_first_bias": "fc1 bias",
+ "ffn_second_kernel": "fc2 weight&&expression_.transpose(0, 1)",
+ "ffn_second_bias": "fc2 bias",
+ # weight_clip_max
+ "multihead_project_kernel_qkv_clip_max": "self_attn qkv_proj weight_quant clip_value_max",
+ "multihead_project_kernel_output_clip_max": "self_attn out_proj weight_quant clip_value_max",
+ "ffn_first_kernel_clip_max": "fc1 weight_quant clip_value_max",
+ "ffn_second_kernel_clip_max": "fc2 weight_quant clip_value_max",
+ # act_clip_max
+ "multihead_ln_clip_max": "self_attn qkv_proj input_quant clip_value_max",
+ "multihead_project_output_clip_max": "self_attn out_proj input_quant clip_value_max",
+ "ffn_ln_clip_max": "fc1 input_quant clip_value_max",
+ "ffn_first_act_clip_max": "fc2 input_quant clip_value_max",
+ "multihead_qkv_dense_clip_max": "self_attn qkv_proj output_quant clip_value_max",
+ "multihead_output_dense_clip_max": "self_attn out_proj output_quant clip_value_max",
+ "ffn_first_output_clip_max": "fc1 output_quant clip_value_max",
+ "self_qkv_bias_out_clip_max": "self_attn attention_quant clip_value_max",
+ }
+)
+
+src_emb_mapping_dict = OrderedDict(
+ {
+ "norm_scale": "ln_f weight",
+ "norm_bias": "ln_f bias",
+ "output_ln_clip_max": "lm_head input_quant clip_value_max",
+ "logits_clip_max": "lm_head output_quant clip_value_max",
+ }
+)
+
+
+def fill_quant_hdf5_layer(
+ tensor_names, state_dict, hdf5_file, hdf5_dataset_prefix, mapping_dict
+):
+ for proto_name, ckpt_rule in mapping_dict.items():
+ target_tensor = apply_rule(proto_name, ckpt_rule, tensor_names, state_dict)
+ if proto_name.endswith("_clip_max"):
+ hdf5_file.create_dataset(
+ hdf5_dataset_prefix + proto_name, data=float(target_tensor[0])
+ )
+ else:
+ hdf5_file.create_dataset(
+ hdf5_dataset_prefix + proto_name,
+ data=target_tensor,
+ )
+
+
+def extract_gpt_weights(
+ output_file,
+ model_dir,
+ head_num,
+ generation_method,
+ topk=1,
+ topp=0.75,
+ eos_id=50256,
+ pad_id=50257,
+ max_step=50,
+):
+ # load var names
+ state_dict = torch.load(model_dir, "cpu")
+
+ var_name_list = list(state_dict.keys())
+
+ for name in var_name_list:
+ if name.endswith("weight_quant.clip.clip_value_max"):
+ state_dict[name[:-26]] = torch.Tensor(
+ quantize(state_dict[name[:-26]].numpy(), 127, state_dict[name].numpy())
+ ).to(torch.uint8)
+
+ # initialize output file
+ print("Saving model to hdf5...")
+ print("Writing to {0}".format(output_file))
+ hdf5_file = h5py.File(output_file, "w")
+
+ # fill each encoder layer's params
+ enc_tensor_names = {}
+ for name in var_name_list:
+ name_split = name.split(".")
+ if len(name_split) <= 2 or not name_split[2].isdigit():
+ continue
+ layer_id = int(name_split[2])
+ enc_tensor_names.setdefault(layer_id, []).append(name)
+
+ # fill encoder_stack
+ for layer_id in sorted(enc_tensor_names.keys()):
+ fill_quant_hdf5_layer(
+ enc_tensor_names[layer_id],
+ state_dict,
+ hdf5_file,
+ f"encoder_stack/{layer_id}/",
+ enc_layer_mapping_dict,
+ )
+
+ # fill src_embedding - except for position embedding
+ fill_quant_hdf5_layer(
+ var_name_list,
+ state_dict,
+ hdf5_file,
+ "src_embedding/",
+ src_emb_mapping_dict,
+ )
+
+ # handling token_embeddings for GPT
+ token_embedding = state_dict["transformer.wte.weight"]
+ token_embedding = quantize(
+ token_embedding.numpy(),
+ 127,
+ state_dict["transformer.wte.emb_quant.clip.clip_value_max"].numpy(),
+ ).transpose()
+ print(f"processed token_embedding, shape: {token_embedding.shape}")
+ hdf5_file.create_dataset(
+ "src_embedding/token_embedding", data=token_embedding, dtype="uint8"
+ )
+ hdf5_file.create_dataset(
+ "src_embedding/emb_clip_max",
+ data=state_dict["transformer.wte.emb_quant.clip.clip_value_max"],
+ )
+
+ # special handling for position embedding
+ position_emb = state_dict["transformer.wpe.weight"]
+ _max_allowed_step, _ = position_emb.shape
+ if max_step > _max_allowed_step:
+        print(f"max_step {max_step} exceeds the max allowed step {_max_allowed_step}, aborting.")
+ return
+ # truncate position embedding for max_step
+ position_emb = position_emb[:max_step, :]
+ print(
+        f"processed position_embedding with max_step constraint, shape: {position_emb.shape}"
+ )
+ position_emb = position_emb.flatten().tolist()
+ hdf5_file.create_dataset(
+ "src_embedding/position_embedding", data=position_emb, dtype="f4"
+ )
+
+ # save number of layers metadata
+ hdf5_file.create_dataset(
+ "model_conf/n_encoder_stack", data=len(enc_tensor_names), dtype="i4"
+ )
+ # fill in model_conf
+ hdf5_file.create_dataset("model_conf/head_num", data=head_num, dtype="i4")
+ hdf5_file.create_dataset("model_conf/src_padding_id", data=pad_id, dtype="i4")
+ hdf5_file.create_dataset(
+ "model_conf/sampling_method",
+ data=np.array([ord(c) for c in generation_method]).astype(np.int8),
+ dtype="i1",
+ )
+ hdf5_file.create_dataset("model_conf/topp", data=topp, dtype="f4")
+ hdf5_file.create_dataset("model_conf/topk", data=topk, dtype="i4")
+ hdf5_file.create_dataset("model_conf/eos_id", data=eos_id, dtype="i4")
+
+ hdf5_file.close()
+ # read-in again to double check
+ hdf5_file = h5py.File(output_file, "r")
+
+ def _print_pair(key, value):
+ if key == "sampling_method":
+ value = "".join(map(chr, value[()]))
+ else:
+ value = value[()]
+ print(f"{key}: {value}")
+
+ list(map(lambda x: _print_pair(*x), hdf5_file["model_conf"].items()))
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ if args.generation_method not in ["topk", "topp", "ppl"]:
+ args.generation_method = "topk"
+ model_name = ".".join(args.model.split(".")[:-1])
+ hdf5_path = f"{model_name}.hdf5"
+
+ head_number = 12 # 20 for "gpt2-large"
+ topk = 1
+ topp = 0.75
+ # default eos_id from https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel
+ eos_id = 50256
+ pad_id = 50257
+ max_step = 50
+ extract_gpt_weights(
+ hdf5_path,
+ args.model,
+ head_num=head_number, # layer number
+ generation_method=args.generation_method,
+ topk=topk,
+ topp=topp,
+ eos_id=eos_id,
+ pad_id=pad_id,
+ max_step=max_step,
+ )
diff --git a/examples/inference/python/export/ls_transformer_export.py b/examples/inference/python/export/ls_transformer_export.py
index 4f549e81..49b50820 100644
--- a/examples/inference/python/export/ls_transformer_export.py
+++ b/examples/inference/python/export/ls_transformer_export.py
@@ -1,7 +1,8 @@
"""
-Export LightSeq Transformer models to protobuf/hdf5 format.
+Export LightSeq Transformer models to protobuf format.
Refer to the `examples/training/custom` directory for more training details.
"""
+import argparse
import time
import numpy as np
import torch
@@ -142,7 +143,7 @@ def create_data():
)
-def create_model(vocab_size):
+def create_config(vocab_size):
transformer_config = LSTransformer.get_config(
model="transformer-base",
max_batch_tokens=2048,
@@ -154,29 +155,7 @@ def create_model(vocab_size):
fp16=True,
local_rank=0,
)
- model = LSTransformer(transformer_config)
- model.to(dtype=torch.half, device=torch.device("cuda:0"))
- return model
-
-
-def ls_train_predict(ls_train_model, src_tokens, trg_tokens, batch_size):
- """
- NOTE: We do not use beam search here for implementation simplicity.
- """
- torch.cuda.synchronize()
- start_time = time.perf_counter()
- encoder_out, encoder_padding_mask = ls_train_model.encoder(src_tokens)
- predict_tokens = trg_tokens[:, :1]
- cache = {}
- for _ in range(trg_seq_len - 1):
- output = ls_train_model.decoder(
- predict_tokens[:, -1:], encoder_out, encoder_padding_mask, cache
- )
- output = torch.reshape(torch.argmax(output, dim=-1), (batch_size, -1))
- predict_tokens = torch.cat([predict_tokens, output], dim=-1)
- torch.cuda.synchronize()
- end_time = time.perf_counter()
- return predict_tokens, end_time - start_time
+ return transformer_config
def ls_predict(ls_infer_model, src_tokens):
@@ -188,6 +167,19 @@ def ls_predict(ls_infer_model, src_tokens):
return ls_output, end_time - start_time
+def parse_args():
+ parser = argparse.ArgumentParser(description="export LightSeq checkpoint", usage="")
+ parser.add_argument(
+ "--model",
+ "-m",
+ type=str,
+ default="checkpoint_best.pt",
+ help="path of LightSeq checkpoint",
+ )
+ args = parser.parse_args()
+ return args
+
+
if __name__ == "__main__":
(
tokenizer,
@@ -205,34 +197,23 @@ def ls_predict(ls_infer_model, src_tokens):
trg_seq_len,
) = create_data()
- ckpt_path = "checkpoint.pt"
- pb_path = "transformer.pb"
+ args = parse_args()
+ model_name = ".".join(args.model.split(".")[:-1])
+ pb_path = f"{model_name}.pb"
- with open(ckpt_path, "rb") as fin:
+ with open(args.model, "rb") as fin:
state_dict = torch.load(fin, map_location=torch.device("cpu"))
- ls_train_model = create_model(vocab_size)
- ls_train_model.load_state_dict(state_dict)
- ls_train_model.eval()
- print("torch model loaded.")
+ config = create_config(vocab_size)
- export_pb(state_dict, pb_path, pad_id, start_id, end_id, ls_train_model.config)
+ export_pb(state_dict, pb_path, pad_id, start_id, end_id, config)
ls_infer_model = lsi.Transformer(pb_path, 8)
src_tokens_np = np.array(src_tokens.cpu())
print("========================WARM UP========================")
- ls_train_predict(ls_train_model, src_tokens, trg_tokens, batch_size)
ls_predict(ls_infer_model, src_tokens_np)
- print("========================TORCH TEST========================")
- predict_tokens, ls_train_time = ls_train_predict(
- ls_train_model, src_tokens, trg_tokens, batch_size
- )
- mask = torch.cumsum(torch.eq(predict_tokens, end_id).int(), dim=1)
- predict_tokens = predict_tokens.masked_fill(mask > 0, end_id)
- predict_text = tokenizer.batch_decode(predict_tokens, skip_special_tokens=True)
-
print("========================LIGHTSEQ TEST========================")
ls_output, ls_time = ls_predict(ls_infer_model, src_tokens_np)
ls_output = [ids[0] for ids in ls_output[0]]
@@ -242,9 +223,6 @@ def ls_predict(ls_infer_model, src_tokens):
print("\n".join(src_text))
print(">>>>> target text")
print("\n".join(trg_text))
- print(">>>>> lightseq (train) predict text")
- print("\n".join(predict_text))
print(">>>>> lightseq (infer) predict text")
print("\n".join(ls_predict_text))
- print("lightseq (train) predict time: {}ms".format(ls_train_time * 1000))
print("lightseq (infer) predict time: {}ms".format(ls_time * 1000))
diff --git a/examples/inference/python/export/ls_transformer_ptq_export.py b/examples/inference/python/export/ls_transformer_ptq_export.py
index ac4c77b0..6d0e1471 100644
--- a/examples/inference/python/export/ls_transformer_ptq_export.py
+++ b/examples/inference/python/export/ls_transformer_ptq_export.py
@@ -1,8 +1,8 @@
"""
-Export LightSeq fp16/fp32 Transformer models to int8 protobuf format,
-and then using int8 quantization to speedup inference.
+Export LightSeq Transformer models to int8 protobuf format using post training quantization.
Refer to the `examples/training/custom` directory for more training details.
"""
+import argparse
import time
import numpy as np
import torch
@@ -183,6 +183,19 @@ def ls_predict(ls_infer_model, src_tokens):
return ls_output, end_time - start_time
+def parse_args():
+ parser = argparse.ArgumentParser(description="export LightSeq checkpoint", usage="")
+ parser.add_argument(
+ "--model",
+ "-m",
+ type=str,
+ default="checkpoint_best.pt",
+ help="path of LightSeq checkpoint",
+ )
+ args = parser.parse_args()
+ return args
+
+
if __name__ == "__main__":
(
tokenizer,
@@ -200,10 +213,11 @@ def ls_predict(ls_infer_model, src_tokens):
trg_seq_len,
) = create_data()
- ckpt_path = "checkpoint.pt"
- pb_path = "quant_transformer.pb"
+ args = parse_args()
+ model_name = ".".join(args.model.split(".")[:-1])
+ pb_path = f"{model_name}_ptq.pb"
- with open(ckpt_path, "rb") as fin:
+ with open(args.model, "rb") as fin:
state_dict = torch.load(fin, map_location=torch.device("cpu"))
config = create_config(vocab_size)
diff --git a/examples/inference/python/export/util.py b/examples/inference/python/export/util.py
new file mode 100644
index 00000000..7ec3ac24
--- /dev/null
+++ b/examples/inference/python/export/util.py
@@ -0,0 +1,55 @@
+import argparse
+import tensorflow as tf
+import h5py
+
+from export.proto.transformer_pb2 import Transformer
+from lightseq.training import export_pb2hdf5
+from lightseq.training import export_quant_pb2hdf5
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="")
+ parser.add_argument(
+ "--model",
+ "-m",
+ type=str,
+ default="checkpoint_best.pt",
+ help="path of fairseq checkpoint",
+ )
+ parser.add_argument(
+ "--hdf5",
+ "-hdf5",
+ action="store_true",
+ help="whether to store hdf5",
+ )
+ parser.add_argument(
+ "--generation_method",
+ "-g",
+ type=str,
+ default="beam_search",
+ choices=["beam_search", "topk_greedy", "topk", "topp", "ppl"],
+ help="generation method",
+ )
+ args = parser.parse_args()
+ return args
+
+
+def save_model(transformer, pb_path, hdf5_path, hdf5):
+ if not hdf5:
+ try:
+ str_model = transformer.SerializeToString()
+ print("Writing to {0}".format(pb_path))
+ with tf.io.gfile.GFile(pb_path, "wb") as fout:
+ fout.write(str_model)
+ return pb_path
+ except:
+ pass
+
+ print("Writing to {0}".format(hdf5_path))
+ f = h5py.File(hdf5_path, "w")
+ if isinstance(transformer, Transformer):
+ export_pb2hdf5(transformer, f)
+ else:
+ export_quant_pb2hdf5(transformer, f)
+ f.close()
+ return hdf5_path
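+
+
+# Minimal usage sketch (illustrative; the export scripts under examples/inference/python/export
+# follow this pattern):
+#   args = parse_args()
+#   transformer = Transformer()  # filled with weights extracted from args.model
+#   path = save_model(transformer, "model.pb", "model.hdf5", args.hdf5)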
diff --git a/examples/inference/python/test/ls_bart.py b/examples/inference/python/test/ls_bart.py
index 7738f49c..2e667c44 100644
--- a/examples/inference/python/test/ls_bart.py
+++ b/examples/inference/python/test/ls_bart.py
@@ -71,6 +71,7 @@ def main():
# change to "facebook/bart-large" for large model
hf_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
hf_model.to("cuda:0")
+ hf_model.eval()
sentences = [
"I love that girl, but does not me.",
diff --git a/examples/inference/python/test/ls_bert.py b/examples/inference/python/test/ls_bert.py
index 7e3b0d4f..baa00a3c 100644
--- a/examples/inference/python/test/ls_bert.py
+++ b/examples/inference/python/test/ls_bert.py
@@ -76,6 +76,7 @@ def main():
print("creating huggingface model...")
hf_model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
hf_model.to("cuda:0")
+ hf_model.eval()
print("creating lightseq model...")
ls_model = LightseqBertClassification("lightseq_bert_base_uncased.hdf5", hf_model)
diff --git a/examples/inference/python/test/ls_fairseq.sh b/examples/inference/python/test/ls_fairseq.sh
index bf6b4d75..9ff1a6d7 100644
--- a/examples/inference/python/test/ls_fairseq.sh
+++ b/examples/inference/python/test/ls_fairseq.sh
@@ -3,7 +3,7 @@
until [[ -z "$1" ]]
do
case $1 in
- --model)
+ -m)
shift; MODEL=$1;
shift;;
*)
diff --git a/examples/inference/python/test/ls_gpt2.py b/examples/inference/python/test/ls_gpt2.py
index bc0f980b..abbd78a6 100644
--- a/examples/inference/python/test/ls_gpt2.py
+++ b/examples/inference/python/test/ls_gpt2.py
@@ -2,30 +2,62 @@
import argparse
import torch
-import numpy as np
import lightseq.inference as lsi
from transformers import GPT2Tokenizer, GPT2LMHeadModel
-def ls_gpt2(model, inputs):
+def ls_gpt2(model, inputs, generation_method="topk"):
torch.cuda.synchronize()
start_time = time.perf_counter()
- generated_ids = model.sample(inputs)
+ results = None
+ if generation_method == "topk" or generation_method == "topp":
+ results = model.sample(inputs)
+ elif generation_method == "ppl":
+ results = model.ppl(inputs)[0]
torch.cuda.synchronize()
end_time = time.perf_counter()
- return generated_ids, end_time - start_time
+ return results, end_time - start_time
-def hf_gpt2(model, inputs, tokenizer):
+def compute_hf_ppl(model, inputs):
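+    # Sliding-window perplexity (following the common Hugging Face recipe): score windows
+    # of `stride` tokens, mask the overlapping context with label -100 so it is ignored by
+    # the loss, and divide the summed negative log-likelihood by the total number of tokens.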
+ max_length = 512
+ stride = 512
+ end_loc = 0
+
+ nlls = []
+ for i in range(0, inputs.size(1), stride):
+ begin_loc = max(i + stride - max_length, 0)
+ end_loc = min(i + stride, inputs.size(1))
+ trg_len = end_loc - i
+ input_ids = inputs[:, begin_loc:end_loc].to("cuda:0")
+ target_ids = input_ids.clone()
+ target_ids[:, :-trg_len] = -100
+
+ with torch.no_grad():
+ outputs = model(input_ids, labels=target_ids)
+ neg_log_likelihood = outputs[0] * trg_len
+
+ nlls.append(neg_log_likelihood)
+
+ ppl = torch.stack(nlls).sum() / end_loc
+ return ppl.cpu().numpy()
+
+
+def hf_gpt2(model, inputs, tokenizer, generation_method="topk"):
inputs = inputs.to("cuda:0")
torch.cuda.synchronize()
start_time = time.perf_counter()
- generated_ids = model.generate(
- inputs, max_length=50, pad_token_id=tokenizer.eos_token_id
- )
+ results = None
+ if generation_method == "topk" or generation_method == "topp":
+ results = model.generate(
+ inputs, max_length=50, pad_token_id=tokenizer.eos_token_id
+ )
+ elif generation_method == "ppl":
+ results = compute_hf_ppl(model, inputs)
+
torch.cuda.synchronize()
end_time = time.perf_counter()
- return generated_ids, end_time - start_time
+ return results, end_time - start_time
def ls_generate(model, tokenizer, inputs):
@@ -50,17 +82,49 @@ def hf_generate(model, tokenizer, inputs):
print(sent)
-def warmup(ls_tokenizer, hf_tokenizer, ls_model, hf_model, sentences):
+def ls_ppl(model, tokenizer, inputs):
+ print("=========lightseq=========")
+ print("lightseq calculating ppl...")
+ ls_ppl, ls_time = ls_gpt2(model, inputs, "ppl")
+ print(f"lightseq time: {ls_time}s")
+ print("lightseq results:")
+ print(ls_ppl)
+
+
+def hf_ppl(model, tokenizer, inputs):
+ print("=========huggingface=========")
+ print("huggingface calculating ppl...")
+ hf_ppl, hf_time = hf_gpt2(model, inputs, tokenizer, "ppl")
+ print(f"huggingface time: {hf_time}s")
+ print("huggingface results:")
+ print(hf_ppl)
+
+
+def warmup(
+ ls_tokenizer, hf_tokenizer, ls_model, hf_model, sentences, generation_method
+):
ls_inputs = ls_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
hf_inputs = hf_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
- ls_generate(ls_model, ls_tokenizer, ls_inputs)
- hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ if generation_method == "topk" or generation_method == "topp":
+ ls_generate(ls_model, ls_tokenizer, ls_inputs)
+ hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ elif generation_method == "ppl":
+ ls_ppl(ls_model, ls_tokenizer, ls_inputs)
+ hf_ppl(hf_model, hf_tokenizer, hf_inputs)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--user_input", action="store_true")
+ parser.add_argument(
+ "--generation_method",
+ "-g",
+ type=str,
+ default="topk",
+ choices=["topk", "topp", "ppl"],
+ help="generation method",
+ )
args = parser.parse_args()
print("initializing gpt tokenizer...")
@@ -81,18 +145,26 @@ def main():
print("creating huggingface model...")
hf_model = GPT2LMHeadModel.from_pretrained("gpt2")
hf_model.to("cuda:0")
+ hf_model.eval()
# lightseq gpt perplexity supports batch infer with different lengths,
# but sampling doesn't support
sentences = [
- "My name is GPT",
- "My name is GPT",
- "My name is GPT",
- "My name is GPT",
+ "I love you, but you say that",
+ "I love you, but you say that",
+ "I love you, but you say that",
+ "I love you, but you say that",
]
print("====================START warmup====================")
- warmup(ls_tokenizer, hf_tokenizer, ls_model, hf_model, sentences)
+ warmup(
+ ls_tokenizer,
+ hf_tokenizer,
+ ls_model,
+ hf_model,
+ sentences,
+ args.generation_method,
+ )
print("====================END warmup====================")
while True:
@@ -108,8 +180,12 @@ def main():
"input_ids"
]
- ls_generate(ls_model, ls_tokenizer, ls_inputs)
- hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ if args.generation_method == "topk" or args.generation_method == "topp":
+ ls_generate(ls_model, ls_tokenizer, ls_inputs)
+ hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ elif args.generation_method == "ppl":
+ ls_ppl(ls_model, ls_tokenizer, ls_inputs)
+ hf_ppl(hf_model, hf_tokenizer, hf_inputs)
if not args.user_input:
break
diff --git a/examples/inference/python/test/ls_quant_bert.py b/examples/inference/python/test/ls_quant_bert.py
new file mode 100644
index 00000000..29046866
--- /dev/null
+++ b/examples/inference/python/test/ls_quant_bert.py
@@ -0,0 +1,176 @@
+import time
+
+import torch
+from transformers import BertTokenizer, BertForTokenClassification, BertConfig
+import lightseq.inference as lsi
+from lightseq.training.ops.pytorch.quantization import qat_mode
+from lightseq.training.ops.pytorch.torch_transformer_layers import (
+ BertEmbeddingLayer,
+ TransformerEncoderLayer,
+)
+from export.util import parse_args
+
+
+def ls_bert(model, inputs):
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+ ls_output = model.infer(inputs)
+ torch.cuda.synchronize()
+ end_time = time.perf_counter()
+ return ls_output, end_time - start_time
+
+
+def hf_bert(model, inputs, attn_mask):
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+ hf_output = model(inputs.to("cuda:0"), attention_mask=attn_mask.to("cuda:0"))
+ torch.cuda.synchronize()
+ end_time = time.perf_counter()
+ return hf_output, end_time - start_time
+
+
+def ls_generate(model, inputs_id):
+ print("=========lightseq=========")
+ print("lightseq generating...")
+ ls_output, ls_time = ls_bert(model, inputs_id)
+ print(f"lightseq time: {ls_time}s")
+ print("lightseq results (class predictions):")
+ print(ls_output.argmax(axis=2).detach().cpu().numpy())
+
+
+def hf_generate(model, inputs_id, attn_mask):
+ print("=========huggingface=========")
+ print("huggingface generating...")
+ hf_output, hf_time = hf_bert(model, inputs_id, attn_mask)
+ print(f"huggingface time: {hf_time}s")
+ print("huggingface results (class predictions):")
+ print(hf_output.logits.argmax(axis=2).detach().cpu().numpy())
+
+
+def warmup(tokenizer, ls_model, hf_model, sentences):
+ inputs = tokenizer(sentences, return_tensors="pt", padding=True)
+ inputs_id = inputs["input_ids"]
+ attn_mask = inputs["attention_mask"]
+
+ ls_generate(ls_model, inputs_id)
+ hf_generate(hf_model, inputs_id, attn_mask)
+
+
+class LightseqBertClassification:
+ def __init__(self, ls_weight_path, hf_model):
+ self.ls_bert = lsi.QuantBert(ls_weight_path, 8)
+ self.classifier = hf_model.classifier
+
+ def infer(self, inputs):
+ last_hidden_states = self.ls_bert.infer(inputs)
+ last_hidden_states = torch.Tensor(last_hidden_states).float()
+ logits = self.classifier(last_hidden_states.to("cuda:0"))
+ return logits
+
+
+def gen_bert_emb_config(config):
+ bert_emb_config = BertEmbeddingLayer.get_config(
+ vocab_size=config.vocab_size,
+ embedding_dim=config.hidden_size,
+ max_batch_tokens=4096,
+ max_seq_len=config.max_position_embeddings,
+ padding_idx=config.pad_token_id,
+ dropout=config.hidden_dropout_prob,
+ fp16=True,
+ local_rank=0,
+ )
+ bert_emb_config.type_vocab_size = config.type_vocab_size
+ bert_emb_config.layer_norm_eps = config.layer_norm_eps
+ return bert_emb_config
+
+
+class LSHFTransformerEncoderLayer(TransformerEncoderLayer):
+ def __init__(self, *args, **kwargs):
+ super(LSHFTransformerEncoderLayer, self).__init__(*args, **kwargs)
+
+ def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs):
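+        # Hugging Face passes an additive attention mask (0 for visible tokens, -10000 for
+        # padding); dividing by -10000 recovers the 0/1 padding mask LightSeq layers expect.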
+ ls_encoder_padding_mask = encoder_padding_mask / -10000.0
+ ls_encoder_padding_mask = ls_encoder_padding_mask.squeeze()
+ output = super().forward(hidden_states, ls_encoder_padding_mask)
+ return (output, None, None, None)
+
+
+def gen_bert_enc_config(config):
+ bert_enc_config = TransformerEncoderLayer.get_config(
+ max_batch_tokens=4096,
+ max_seq_len=config.max_position_embeddings,
+ hidden_size=config.hidden_size,
+ intermediate_size=config.intermediate_size,
+ nhead=config.num_attention_heads,
+ attn_prob_dropout_ratio=config.attention_probs_dropout_prob,
+ activation_dropout_ratio=config.hidden_dropout_prob,
+ hidden_dropout_ratio=config.hidden_dropout_prob,
+ pre_layer_norm=False,
+ fp16=True,
+ local_rank=0,
+ activation_fn="gelu",
+ )
+ return bert_enc_config
+
+
+def inject_ls_layer(model, config):
+ bert_emb_config = gen_bert_emb_config(config)
+ model.bert.embeddings = BertEmbeddingLayer(bert_emb_config)
+ model.bert.embeddings.apply(qat_mode)
+
+ for i in range(config.num_hidden_layers):
+ bert_enc_config = gen_bert_enc_config(config)
+ model.bert.encoder.layer[i] = LSHFTransformerEncoderLayer(
+ bert_enc_config
+ ).cuda()
+ model.bert.encoder.layer[i].apply(qat_mode)
+
+
+def main():
+ args = parse_args()
+ model_name = ".".join(args.model.split(".")[:-1])
+ ckpt_path = f"{model_name}.bin"
+
+ print("initializing bert config...")
+ config = BertConfig.from_pretrained(
+ "bert-base-uncased", num_labels=9, finetuning_task="ner"
+ )
+
+ print("initializing bert tokenizer...")
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+ print("creating huggingface model...")
+ hf_model = BertForTokenClassification.from_pretrained(
+ "bert-base-uncased", config=config
+ )
+ inject_ls_layer(hf_model, config)
+ state_dict = torch.load(ckpt_path, map_location="cpu")
+ hf_model.load_state_dict(state_dict, strict=False)
+ hf_model.to("cuda:0")
+ hf_model.eval()
+
+ print("creating lightseq model...")
+ ls_model = LightseqBertClassification(args.model, hf_model)
+
+ sentences = [
+ "EU rejects German call to boycott British lamb .",
+ "-- Dimitris Kontogiannis , Athens Newsroom +301 3311812-4",
+ "BayerVB sets C$ 100 million six-year bond .",
+ "China says time right for Taiwan talks .",
+ ]
+
+ print("====================START warmup====================")
+ warmup(tokenizer, ls_model, hf_model, sentences)
+ print("====================END warmup====================")
+
+ print("tokenizing the sentences...")
+ inputs = tokenizer(sentences, return_tensors="pt", padding=True)
+ inputs_id = inputs["input_ids"]
+ attn_mask = inputs["attention_mask"]
+
+ ls_generate(ls_model, inputs_id)
+ hf_generate(hf_model, inputs_id, attn_mask)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/inference/python/test/ls_quant_gpt2.py b/examples/inference/python/test/ls_quant_gpt2.py
new file mode 100644
index 00000000..033ac5b4
--- /dev/null
+++ b/examples/inference/python/test/ls_quant_gpt2.py
@@ -0,0 +1,251 @@
+import time
+
+import torch
+from torch import nn
+from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
+import lightseq.inference as lsi
+from lightseq.training.ops.pytorch.quantization import (
+ qat_mode,
+ QuantLinear,
+ TensorQuantizer,
+ weight_quant_config,
+)
+from lightseq.training.ops.pytorch.torch_transformer_layers import (
+ TransformerDecoderLayer,
+)
+from export.util import parse_args
+
+
+def ls_gpt2(model, inputs, generation_method="topk"):
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+ results = None
+ if generation_method == "topk" or generation_method == "topp":
+ results = model.sample(inputs)
+ elif generation_method == "ppl":
+ results = model.ppl(inputs)[0]
+ torch.cuda.synchronize()
+ end_time = time.perf_counter()
+ return results, end_time - start_time
+
+
+def compute_hf_ppl(model, inputs):
+ max_length = 512
+ stride = 512
+ end_loc = 0
+
+ nlls = []
+ for i in range(0, inputs.size(1), stride):
+ begin_loc = max(i + stride - max_length, 0)
+ end_loc = min(i + stride, inputs.size(1))
+ trg_len = end_loc - i
+ input_ids = inputs[:, begin_loc:end_loc].to("cuda:0")
+ target_ids = input_ids.clone()
+ target_ids[:, :-trg_len] = -100
+
+ with torch.no_grad():
+ outputs = model(input_ids, labels=target_ids)
+ neg_log_likelihood = outputs[0] * trg_len
+
+ nlls.append(neg_log_likelihood)
+
+ ppl = torch.stack(nlls).sum() / end_loc
+ return ppl.cpu().numpy()
+
+
+def hf_gpt2(model, inputs, tokenizer, generation_method="topk"):
+ inputs = inputs.to("cuda:0")
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+ results = None
+ if generation_method == "topk" or generation_method == "topp":
+ results = model.generate(
+ inputs, max_length=50, pad_token_id=tokenizer.eos_token_id
+ )
+ elif generation_method == "ppl":
+ results = compute_hf_ppl(model, inputs)
+
+ torch.cuda.synchronize()
+ end_time = time.perf_counter()
+ return results, end_time - start_time
+
+
+def ls_generate(model, tokenizer, inputs):
+ print("=========lightseq=========")
+ print("lightseq generating...")
+ ls_res_ids, ls_time = ls_gpt2(model, inputs)
+ ls_res = tokenizer.batch_decode(ls_res_ids, skip_special_tokens=True)
+ print(f"lightseq time: {ls_time}s")
+ print("lightseq results:")
+ for sent in ls_res:
+ print(sent)
+
+
+def hf_generate(model, tokenizer, inputs):
+ print("=========huggingface=========")
+ print("huggingface generating...")
+ hf_res_ids, hf_time = hf_gpt2(model, inputs, tokenizer)
+ hf_res = tokenizer.batch_decode(hf_res_ids, skip_special_tokens=True)
+ print(f"huggingface time: {hf_time}s")
+ print("huggingface results:")
+ for sent in hf_res:
+ print(sent)
+
+
+def ls_ppl(model, tokenizer, inputs):
+ print("=========lightseq=========")
+ print("lightseq calculating ppl...")
+ ls_ppl, ls_time = ls_gpt2(model, inputs, "ppl")
+ print(f"lightseq time: {ls_time}s")
+ print("lightseq results:")
+ print(ls_ppl)
+
+
+def hf_ppl(model, tokenizer, inputs):
+ print("=========huggingface=========")
+ print("huggingface calculating ppl...")
+ hf_ppl, hf_time = hf_gpt2(model, inputs, tokenizer, "ppl")
+ print(f"huggingface time: {hf_time}s")
+ print("huggingface results:")
+ print(hf_ppl)
+
+
+def warmup(
+ ls_tokenizer, hf_tokenizer, ls_model, hf_model, sentences, generation_method
+):
+ ls_inputs = ls_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
+ hf_inputs = hf_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
+
+ if generation_method == "topk" or generation_method == "topp":
+ ls_generate(ls_model, ls_tokenizer, ls_inputs)
+ # hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ elif generation_method == "ppl":
+ ls_ppl(ls_model, ls_tokenizer, ls_inputs)
+ hf_ppl(hf_model, hf_tokenizer, hf_inputs)
+
+
+class GptEmbedding(nn.Embedding):
+ def __init__(self, *args, **kwargs):
+ super(GptEmbedding, self).__init__(*args, **kwargs)
+ self.emb_quant = TensorQuantizer(weight_quant_config)
+
+ def forward(self, input_ids):
+ x = super(GptEmbedding, self).forward(input_ids)
+ x = self.emb_quant(x)
+ return x
+
+
+def gen_gpt_enc_config(config):
+ gpt_enc_config = TransformerDecoderLayer.get_config(
+ max_batch_tokens=8192,
+ max_seq_len=config.max_position_embeddings,
+ hidden_size=config.hidden_size,
+ intermediate_size=4 * config.hidden_size,
+ nhead=config.num_attention_heads,
+ attn_prob_dropout_ratio=config.attn_pdrop,
+ activation_dropout_ratio=config.resid_pdrop,
+ hidden_dropout_ratio=config.resid_pdrop,
+ pre_layer_norm=True,
+ fp16=True,
+ local_rank=0,
+ nlayer=config.num_hidden_layers,
+ activation_fn="gelu",
+ has_cross_attn=False,
+ )
+ return gpt_enc_config
+
+
+class LSHFGptEncoderLayer(TransformerDecoderLayer):
+ def __init__(self, *args, **kwargs):
+ super(LSHFGptEncoderLayer, self).__init__(*args, **kwargs)
+
+ def forward(self, hidden_states, attention_mask=None, *args, **kwargs):
+ if attention_mask is not None:
+ ls_attention_mask = attention_mask.squeeze()
+ else:
+ ls_attention_mask = torch.zeros(hidden_states.size()[:2])
+ output = super().forward(hidden_states, ls_attention_mask)
+ return output
+
+
+def inject_ls_layer(model, config):
+ model.transformer.wte = GptEmbedding(config.vocab_size, config.hidden_size)
+ model.transformer.wte.apply(qat_mode)
+
+ for i in range(config.num_hidden_layers):
+ gpt_enc_config = gen_gpt_enc_config(config)
+ model.transformer.h[i] = LSHFGptEncoderLayer(gpt_enc_config).cuda()
+ model.transformer.h[i].apply(qat_mode)
+
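+    # Tie the quantized lm_head to the token embedding: the projection reuses the embedding
+    # weight and its quantizer, so the logits share the same int8 clip range.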
+ q_lm_head = QuantLinear(config.n_embd, config.vocab_size, bias=False)
+ q_lm_head.weight = model.transformer.wte.weight
+ q_lm_head.weight_quant = model.transformer.wte.emb_quant
+ model.lm_head = q_lm_head
+
+
+def main():
+ args = parse_args()
+ if args.generation_method not in ["topk", "topp", "ppl"]:
+ args.generation_method = "topk"
+ model_name = ".".join(args.model.split(".")[:-1])
+ ckpt_path = f"{model_name}.bin"
+
+ print("initializing gpt2 config...")
+ config = GPT2Config.from_pretrained("gpt2")
+
+ print("initializing gpt2 tokenizer...")
+ ls_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+    # lightseq uses len(tokenizer) as the pad token id by default
+ ls_tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+ print(f"lightseq tokenizer pad token id: {ls_tokenizer.pad_token_id}")
+
+ hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+    # use EOS as PAD for huggingface to avoid warnings (see https://huggingface.co/blog/how-to-generate) while avoiding reshaping the model embedding
+ hf_tokenizer.pad_token = hf_tokenizer.eos_token
+ print(f"huggingface tokenizer pad token id: {hf_tokenizer.pad_token_id}")
+
+ print("creating huggingface model...")
+ hf_model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
+ inject_ls_layer(hf_model, config)
+ state_dict = torch.load(ckpt_path, map_location="cpu")
+ hf_model.load_state_dict(state_dict, strict=False)
+ hf_model.to("cuda:0")
+ hf_model.eval()
+
+ print("creating lightseq model...")
+ ls_model = lsi.QuantGpt(args.model, max_batch_size=16)
+
+    # lightseq gpt perplexity supports batch inference with different lengths,
+    # but sampling does not
+ sentences = [
+ "I love you, but you say that",
+ "I love you, but you say that",
+ "I love you, but you say that",
+ "I love you, but you say that",
+ ]
+
+ print("====================START warmup====================")
+ warmup(
+ ls_tokenizer,
+ hf_tokenizer,
+ ls_model,
+ hf_model,
+ sentences,
+ args.generation_method,
+ )
+ print("====================END warmup====================")
+
+ print("tokenizing the sentences...")
+ ls_inputs = ls_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
+ hf_inputs = hf_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
+
+ if args.generation_method == "topk" or args.generation_method == "topp":
+ ls_generate(ls_model, ls_tokenizer, ls_inputs)
+ # hf_generate(hf_model, hf_tokenizer, hf_inputs)
+ elif args.generation_method == "ppl":
+ ls_ppl(ls_model, ls_tokenizer, ls_inputs)
+ hf_ppl(hf_model, hf_tokenizer, hf_inputs)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/training/custom/README.md b/examples/training/custom/README.md
index be38ed7d..27494f33 100644
--- a/examples/training/custom/README.md
+++ b/examples/training/custom/README.md
@@ -6,7 +6,7 @@ The source inputs of the encoder are batch of sentences and the target outputs o
You can run the example simply by:
```shell
-python examples/training/custom/run.py
+$ python examples/training/custom/run.py
```
If it runs successfully, you will see the following output:
diff --git a/examples/training/deepspeed/README.md b/examples/training/deepspeed/README.md
index ac078949..63c88ff3 100644
--- a/examples/training/deepspeed/README.md
+++ b/examples/training/deepspeed/README.md
@@ -3,12 +3,12 @@ This repo contains an example for how to use LightSeq to accerate the training o
First you should install these requirements.
```shell
-pip install torch ninja fairseq deepspeed
+$ pip install torch ninja fairseq deepspeed
```
Then you can train a translation task on wmt14 en2de dataset by running the following script:
```shell
-sh examples/training/deepspeed/ds_fairseq_wmt14en2de.sh
+$ sh examples/training/deepspeed/ds_fairseq_wmt14en2de.sh
```
This script first downloads the dataset and then runs the native Fairseq training script using the DeepSpeed launcher without any other parameter modifications.
diff --git a/examples/training/fairseq/README.md b/examples/training/fairseq/README.md
index 623ad511..093ddfe7 100644
--- a/examples/training/fairseq/README.md
+++ b/examples/training/fairseq/README.md
@@ -1,26 +1,32 @@
# LightSeq for Fairseq
-This repo contains an example for how to use LightSeq to accerate the training of translation task in [Fairseq](https://github.com/pytorch/fairseq).
+This repo contains examples of how to use LightSeq to accelerate the training of translation tasks in [Fairseq](https://github.com/pytorch/fairseq).
First you should install these requirements.
```shell
-pip install lightseq fairseq sacremoses
+$ pip install lightseq fairseq sacremoses
```
## Train
-Then you can train a translation task on wmt14 en2de dataset by running the following script:
+Then you can train a translation task on the wmt14 en2de dataset using LightSeq by running the following script:
```shell
-sh examples/training/fairseq/ls_fairseq_wmt14en2de.sh
+$ sh examples/training/fairseq/ls_fairseq_wmt14en2de.sh
```
Or you can use LightSeq modules like `--arch ls_transformer_wmt_en_de_big_t2t`,
by adding `--user-dir=${LIGHTSEQ_DIR}/lightseq/training/cli/fs_modules`
to `fairseq-train`.
+You can use `--use-torch-layer` to replace LightSeq layers with custom Torch layers based on native Fairseq layers.
+
+You can use `--enable-quant` and `--quant-mode qat` to run quantization-aware training for subsequent LightSeq fast int8 inference (see the example below).
+
This script first downloads the dataset and then runs the native Fairseq
training script using the optimized model and optimizer.
The `lightseq-train` command is just an easy-to-use wrapper of `fairseq-train` that adds
LightSeq to `--user-dir`.
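+For example, a minimal sketch that combines `lightseq-train` with the flags above (the data
+path and the remaining hyperparameters are assumptions, adjust them to your setup):
+```shell
+$ lightseq-train /tmp/wmt14_en_de/ \
+    --task translation \
+    --arch ls_transformer_wmt_en_de_big_t2t \
+    --use-torch-layer \
+    --enable-quant --quant-mode qat \
+    --fp16
+```
+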
+We also provide other training scripts to support custom Torch layers and quantization. All model files have been publicly released. **Refer to [examples/inference/python/README.md](../../../examples/inference/python/README.md) for more training, export and inference details.**
+
LightSeq can achieve about 1.47x speedup using batch size 4096 on 8 V100 GPUs,
compared with original Fairseq implementation. You can delete the `ls` prefix in parameters
to switch to fairseq modules.
@@ -28,7 +34,7 @@ to switch to fairseq modules.
## Evaluation
Then you can evaluate on wmt14 en2de dataset by running the following command:
```shell
-lightseq-validate /tmp/wmt14_en_de/ \
+$ lightseq-validate /tmp/wmt14_en_de/ \
--valid-subset valid \
--path checkpoints/checkpoint_best.pt \
--task translation \
@@ -41,11 +47,11 @@ lightseq-validate /tmp/wmt14_en_de/ \
## Generate
You can also generate on wmt14 en2de dataset by running the following command:
```shell
-lightseq-generate /tmp/wmt14_en_de/ \
+$ lightseq-generate /tmp/wmt14_en_de/ \
--gen-subset test \
--path checkpoints/checkpoint_best.pt \
--task translation \
- --max-tokens 8192 \
+ --batch-size 128 \
--beam 4 \
--lenpen 0.6 \
--fp16 \
diff --git a/examples/training/huggingface/README.md b/examples/training/huggingface/README.md
deleted file mode 100644
index d8686202..00000000
--- a/examples/training/huggingface/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# LightSeq for HuggingFace
-
-This repo contains an example for how to use LightSeq to accerate the training of BERT in HuggingFace [Transformers](https://github.com/huggingface/transformers).
-
-We modify the token classification [examples](https://github.com/huggingface/transformers/tree/master/examples/pytorch/token-classification) in HuggingFace Transformers by replacing their encoder layers with the fused ones in LightSeq.
-
-First you should install these requirements.
-
-```shell
-pip install torch ninja transformers seqeval datasets
-```
-
-Then you can easily fine-tunes BERT on CoNLL-2003 by running the bash script `run_ner.sh`
-or on GLUE by `run_glue.sh`. From our tests, speedup is about 1.6x .
diff --git a/examples/training/huggingface/bert/README.md b/examples/training/huggingface/bert/README.md
new file mode 100644
index 00000000..77dde9aa
--- /dev/null
+++ b/examples/training/huggingface/bert/README.md
@@ -0,0 +1,19 @@
+# LightSeq for HuggingFace BERT
+
+This repo contains an example of how to use LightSeq to accelerate the training of BERT in HuggingFace [Transformers](https://github.com/huggingface/transformers).
+
+We modify examples in HuggingFace Transformers, such as the token classification [examples](https://github.com/huggingface/transformers/tree/master/examples/pytorch/token-classification), by replacing their encoder layers with the fused ones in LightSeq.
+
+First you should install these requirements.
+
+```shell
+$ pip install torch ninja transformers seqeval datasets
+```
+
+Before running the following training scripts, you need to switch to this directory:
+```shell
+$ cd examples/training/huggingface/bert
+```
+
+Then you can easily fine-tune BERT on different tasks by running the bash scripts `task_ner/run_ner.sh`,
+`task_glue/run_glue.sh`, `task_qa/run_qa.sh`, etc. From our tests, the speedup is about 1.6x.
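+
+For example, to fine-tune on the CoNLL-2003 NER task (run from this directory):
+```shell
+$ sh task_ner/run_ner.sh
+```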
diff --git a/examples/training/huggingface/__init__.py b/examples/training/huggingface/bert/__init__.py
similarity index 100%
rename from examples/training/huggingface/__init__.py
rename to examples/training/huggingface/bert/__init__.py
diff --git a/examples/training/huggingface/bert/ls_hf_transformer_layer.py b/examples/training/huggingface/bert/ls_hf_transformer_layer.py
new file mode 100644
index 00000000..6ad9b8d8
--- /dev/null
+++ b/examples/training/huggingface/bert/ls_hf_transformer_layer.py
@@ -0,0 +1,116 @@
+from lightseq.training.ops.pytorch.quantization import qat_mode, disable_quant
+from lightseq.training.ops.pytorch.torch_transformer_layers import BertEmbeddingLayer
+
+
+def get_hf_bert_enc_layer_params(layer):
+ init_ws = []
+ init_bs = []
+
+ init_ws.append(layer.attention.self.query.weight.detach().clone())
+ init_bs.append(layer.attention.self.query.bias.detach().clone())
+ init_ws.append(layer.attention.self.key.weight.detach().clone())
+ init_bs.append(layer.attention.self.key.bias.detach().clone())
+ init_ws.append(layer.attention.self.value.weight.detach().clone())
+ init_bs.append(layer.attention.self.value.bias.detach().clone())
+ init_ws.append(layer.attention.output.dense.weight.detach().clone())
+ init_bs.append(layer.attention.output.dense.bias.detach().clone())
+ init_ws.append(layer.attention.output.LayerNorm.weight.detach().clone())
+ init_bs.append(layer.attention.output.LayerNorm.bias.detach().clone())
+
+ init_ws.append(layer.intermediate.dense.weight.detach().clone())
+ init_bs.append(layer.intermediate.dense.bias.detach().clone())
+ init_ws.append(layer.output.dense.weight.detach().clone())
+ init_bs.append(layer.output.dense.bias.detach().clone())
+ init_ws.append(layer.output.LayerNorm.weight.detach().clone())
+ init_bs.append(layer.output.LayerNorm.bias.detach().clone())
+
+ return init_ws, init_bs
+
+
+def get_hf_bert_emb_layer_params(layer):
+ init_ws = []
+
+ init_ws.append(layer.word_embeddings.weight.detach().clone())
+ init_ws.append(layer.position_embeddings.weight.detach().clone())
+ init_ws.append(layer.token_type_embeddings.weight.detach().clone())
+ init_ws.append(layer.LayerNorm.weight.detach().clone())
+ init_ws.append(layer.LayerNorm.bias.detach().clone())
+
+ return init_ws
+
+
+def gen_bert_emb_config(training_args, config):
+ bert_emb_config = BertEmbeddingLayer.get_config(
+ vocab_size=config.vocab_size,
+ embedding_dim=config.hidden_size,
+ max_batch_tokens=4096,
+ max_seq_len=config.max_position_embeddings,
+ padding_idx=config.pad_token_id,
+ dropout=config.hidden_dropout_prob,
+ fp16=training_args.fp16,
+ local_rank=training_args.local_rank,
+ )
+ bert_emb_config.type_vocab_size = config.type_vocab_size
+ bert_emb_config.layer_norm_eps = config.layer_norm_eps
+ return bert_emb_config
+
+
+def inject_ls_layer(model, training_args, model_args, config):
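+    # module_type selects the encoder implementation (see ModelArguments in run_glue.py):
+    # 1 uses the LightSeq CUDA layer, 2 uses the custom Torch layer; quantization (qat_mode)
+    # is only applied when module_type is 2.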
+ if model_args.module_type == 2:
+ from lightseq.training.ops.pytorch.torch_transformer_layers import (
+ TransformerEncoderLayer,
+ )
+ elif model_args.module_type == 1:
+ from lightseq.training.ops.pytorch.transformer_encoder_layer import (
+ LSTransformerEncoderLayer as TransformerEncoderLayer,
+ )
+ else:
+ raise NotImplementedError
+
+ if model_args.module_type == 2:
+ bert_emb_config = gen_bert_emb_config(training_args, config)
+ init_ws = get_hf_bert_emb_layer_params(model.bert.embeddings)
+ model.bert.embeddings = BertEmbeddingLayer(bert_emb_config, init_ws)
+ if model_args.enable_quant:
+ model.bert.embeddings.apply(qat_mode)
+ else:
+ model.bert.embeddings.apply(disable_quant)
+
+ class LSHFTransformerEncoderLayer(TransformerEncoderLayer):
+ def __init__(self, *args, **kwargs):
+ super(LSHFTransformerEncoderLayer, self).__init__(*args, **kwargs)
+
+ def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs):
+ ls_encoder_padding_mask = encoder_padding_mask / -10000.0
+ ls_encoder_padding_mask = ls_encoder_padding_mask.squeeze()
+ output = super().forward(hidden_states, ls_encoder_padding_mask)
+ return (output, None, None, None)
+
+ def gen_bert_enc_config(training_args, config):
+ bert_enc_config = TransformerEncoderLayer.get_config(
+ max_batch_tokens=4096,
+ max_seq_len=config.max_position_embeddings,
+ hidden_size=config.hidden_size,
+ intermediate_size=config.intermediate_size,
+ nhead=config.num_attention_heads,
+ attn_prob_dropout_ratio=config.attention_probs_dropout_prob,
+ activation_dropout_ratio=config.hidden_dropout_prob,
+ hidden_dropout_ratio=config.hidden_dropout_prob,
+ pre_layer_norm=False,
+ fp16=training_args.fp16,
+ local_rank=training_args.local_rank,
+ activation_fn="gelu",
+ )
+ return bert_enc_config
+
+ for i in range(config.num_hidden_layers):
+ bert_enc_config = gen_bert_enc_config(training_args, config)
+ init_ws, init_bs = get_hf_bert_enc_layer_params(model.bert.encoder.layer[i])
+ model.bert.encoder.layer[i] = LSHFTransformerEncoderLayer(
+ bert_enc_config, init_ws, init_bs
+ ).cuda()
+ if model_args.module_type == 2:
+ if model_args.enable_quant:
+ model.bert.encoder.layer[i].apply(qat_mode)
+ else:
+ model.bert.encoder.layer[i].apply(disable_quant)
diff --git a/examples/training/huggingface/run_glue.py b/examples/training/huggingface/bert/task_glue/run_glue.py
similarity index 98%
rename from examples/training/huggingface/run_glue.py
rename to examples/training/huggingface/bert/task_glue/run_glue.py
index 1a2274da..0b3b62ca 100644
--- a/examples/training/huggingface/run_glue.py
+++ b/examples/training/huggingface/bert/task_glue/run_glue.py
@@ -45,7 +45,7 @@
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
-from ls_hf_transformer_encoder_layer import inject_ls_enc_layer
+from ls_hf_transformer_layer import inject_ls_layer
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -224,9 +224,15 @@ class ModelArguments:
"with private models)."
},
)
- with_lightseq: bool = field(
- default=True,
- metadata={"help": "Whether to use lightseq TransformerEncoder"},
+ module_type: int = field(
+ default=1,
+ metadata={
+ "help": "0: original Hugging Face layer, 1: LightSeq CUDA layer, 2: custom Torch layer"
+ },
+ )
+ enable_quant: bool = field(
+ default=False,
+ metadata={"help": "Whether to enable quantization"},
)
@@ -410,8 +416,8 @@ def main():
)
# Replace with LightSeq encoder layers.
- if model_args.with_lightseq:
- inject_ls_enc_layer(model, training_args, config)
+ if model_args.module_type == 1 or model_args.module_type == 2:
+ inject_ls_layer(model, training_args, model_args, config)
# Preprocessing the datasets
if data_args.task_name is not None:
diff --git a/examples/training/huggingface/run_glue.sh b/examples/training/huggingface/bert/task_glue/run_glue.sh
similarity index 88%
rename from examples/training/huggingface/run_glue.sh
rename to examples/training/huggingface/bert/task_glue/run_glue.sh
index 84fa3c38..a7756ab2 100644
--- a/examples/training/huggingface/run_glue.sh
+++ b/examples/training/huggingface/bert/task_glue/run_glue.sh
@@ -15,22 +15,23 @@
THIS_DIR=$(dirname $(readlink -f $0))
-export TASK_NAME=stsb
+export TASK_NAME=sst2
python3 -m torch.distributed.launch \
--nproc_per_node=1 \
$THIS_DIR/run_glue.py \
- --model_name_or_path bert-large-cased \
+ --model_name_or_path bert-base-cased \
--task_name $TASK_NAME \
--do_train \
--do_eval \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-5 \
- --num_train_epochs 3 \
+ --num_train_epochs 10 \
--output_dir /tmp/$TASK_NAME/ \
--overwrite_output_dir \
- --with_lightseq true \
--fp16 \
--seed 1234 \
--logging_steps 10 \
+ --module_type 2 \
+ --enable_quant false
diff --git a/examples/training/huggingface/bert/task_glue/run_quant_glue.sh b/examples/training/huggingface/bert/task_glue/run_quant_glue.sh
new file mode 100644
index 00000000..d60e9233
--- /dev/null
+++ b/examples/training/huggingface/bert/task_glue/run_quant_glue.sh
@@ -0,0 +1,38 @@
+# Copyright 2021 The LightSeq Team
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THIS_DIR=$(dirname $(readlink -f $0))
+
+export TASK_NAME=sst2
+
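+# Quantization-aware fine-tuning: --resume_from_checkpoint below reuses the fp16 checkpoint that run_glue.sh saves under /tmp/$TASK_NAME/.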
+python3 -m torch.distributed.launch \
+ --nproc_per_node=1 \
+ $THIS_DIR/run_glue.py \
+ --model_name_or_path bert-base-cased \
+ --task_name $TASK_NAME \
+ --do_train \
+ --do_eval \
+ --max_seq_length 128 \
+ --per_device_train_batch_size 32 \
+ --learning_rate 2e-6 \
+ --num_train_epochs 20 \
+ --output_dir /tmp/quant/$TASK_NAME/ \
+ --overwrite_output_dir \
+ --resume_from_checkpoint /tmp/$TASK_NAME/ \
+ --fp16 \
+ --seed 1234 \
+ --logging_steps 10 \
+ --module_type 2 \
+ --enable_quant true
diff --git a/examples/training/huggingface/bert/task_ner/predict_quant_ner.sh b/examples/training/huggingface/bert/task_ner/predict_quant_ner.sh
new file mode 100644
index 00000000..df81783e
--- /dev/null
+++ b/examples/training/huggingface/bert/task_ner/predict_quant_ner.sh
@@ -0,0 +1,42 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
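+# Parse "-m <checkpoint>": the quantized NER checkpoint passed to --resume_from_checkpoint for prediction.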
+until [[ -z "$1" ]]
+do
+ case $1 in
+ -m)
+ shift; MODEL=$1;
+ shift;;
+ *)
+ shift;;
+ esac
+done
+
+THIS_DIR=$(dirname $(readlink -f $0))
+
+python3 -m torch.distributed.launch \
+ --nproc_per_node=1 \
+ $THIS_DIR/run_ner.py \
+ --model_name_or_path bert-base-uncased \
+ --dataset_name conll2003 \
+ --do_predict \
+ --per_device_train_batch_size 4 \
+ --output_dir /tmp/quant/test-ner \
+ --overwrite_output_dir \
+ --resume_from_checkpoint $MODEL \
+ --fp16 \
+ --seed 1234 \
+ --logging_steps 10 \
+ --module_type 2 \
+ --enable_quant true
diff --git a/examples/training/huggingface/run_ner.py b/examples/training/huggingface/bert/task_ner/run_ner.py
similarity index 96%
rename from examples/training/huggingface/run_ner.py
rename to examples/training/huggingface/bert/task_ner/run_ner.py
index 1f287bfd..41db6c1d 100644
--- a/examples/training/huggingface/run_ner.py
+++ b/examples/training/huggingface/bert/task_ner/run_ner.py
@@ -28,6 +28,7 @@
import numpy as np
from datasets import ClassLabel, load_dataset, load_metric
+import torch
import transformers
from transformers import (
@@ -43,7 +44,7 @@
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
-from ls_hf_transformer_encoder_layer import inject_ls_enc_layer
+from ls_hf_transformer_layer import inject_ls_layer
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -94,9 +95,15 @@ class ModelArguments:
"with private models)."
},
)
- with_lightseq: bool = field(
- default=True,
- metadata={"help": "Whether to use lightseq TransformerEncoder"},
+ module_type: int = field(
+ default=1,
+ metadata={
+ "help": "0: original Hugging Face layer, 1: LightSeq CUDA layer, 2: custom Torch layer"
+ },
+ )
+ enable_quant: bool = field(
+ default=False,
+ metadata={"help": "Whether to enable quantization"},
)
@@ -369,8 +376,8 @@ def get_label_list(labels):
)
# Replace with LightSeq encoder layers.
- if model_args.with_lightseq:
- inject_ls_enc_layer(model, training_args, config)
+ if model_args.module_type == 1 or model_args.module_type == 2:
+ inject_ls_layer(model, training_args, model_args, config)
# Tokenizer check: this script requires a fast tokenizer.
if not isinstance(tokenizer, PreTrainedTokenizerFast):
@@ -513,6 +520,12 @@ def compute_metrics(p):
compute_metrics=compute_metrics,
)
+ if not training_args.do_train:
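+        # Predict-only runs never call trainer.train(), so load the fine-tuned weights from --resume_from_checkpoint by hand.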
+ state_dict = torch.load(
+ training_args.resume_from_checkpoint, map_location="cpu"
+ )
+ trainer._load_state_dict_in_model(state_dict)
+
# Training
if training_args.do_train:
checkpoint = None
diff --git a/examples/training/huggingface/run_ner.sh b/examples/training/huggingface/bert/task_ner/run_ner.sh
similarity index 82%
rename from examples/training/huggingface/run_ner.sh
rename to examples/training/huggingface/bert/task_ner/run_ner.sh
index e37695d1..2664fdbb 100644
--- a/examples/training/huggingface/run_ner.sh
+++ b/examples/training/huggingface/bert/task_ner/run_ner.sh
@@ -14,19 +14,19 @@
THIS_DIR=$(dirname $(readlink -f $0))
-if [ -d "/tmp/test-ner/" ]; then
- rm -rf /tmp/test-ner/
-fi
-
python3 -m torch.distributed.launch \
--nproc_per_node=1 \
$THIS_DIR/run_ner.py \
- --model_name_or_path bert-large-uncased \
- --per_device_train_batch_size 16 \
+ --model_name_or_path bert-base-uncased \
--dataset_name conll2003 \
- --output_dir /tmp/test-ner \
--do_train \
--do_eval \
- --num_train_epochs 1 \
- --with_lightseq true \
+ --per_device_train_batch_size 16 \
+ --num_train_epochs 10 \
+ --output_dir /tmp/test-ner \
+ --overwrite_output_dir \
--fp16 \
+ --seed 1234 \
+ --logging_steps 10 \
+ --module_type 2 \
+ --enable_quant false
diff --git a/examples/training/huggingface/run_ner_no_trainer.sh b/examples/training/huggingface/bert/task_ner/run_quant_ner.sh
similarity index 61%
rename from examples/training/huggingface/run_ner_no_trainer.sh
rename to examples/training/huggingface/bert/task_ner/run_quant_ner.sh
index 278aa9cc..3d962e66 100644
--- a/examples/training/huggingface/run_ner_no_trainer.sh
+++ b/examples/training/huggingface/bert/task_ner/run_quant_ner.sh
@@ -14,13 +14,20 @@
THIS_DIR=$(dirname $(readlink -f $0))
-if [ -d "/tmp/test-ner/" ]; then
- rm -rf /tmp/test-ner/
-fi
-
-accelerate launch $THIS_DIR/run_ner_no_trainer.py \
- --model_name_or_path bert-large-uncased \
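+# Quantization-aware fine-tuning for NER: resumes from the fp16 checkpoint that run_ner.sh saves under /tmp/test-ner/.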
+python3 -m torch.distributed.launch \
+ --nproc_per_node=1 \
+ $THIS_DIR/run_ner.py \
+ --model_name_or_path bert-base-uncased \
--dataset_name conll2003 \
- --output_dir /tmp/test-ner \
- --task_name ner \
- --num_train_epochs 1
+ --do_train \
+ --do_eval \
+ --per_device_train_batch_size 16 \
+ --num_train_epochs 20 \
+ --output_dir /tmp/quant/test-ner \
+ --overwrite_output_dir \
+ --resume_from_checkpoint /tmp/test-ner/ \
+ --fp16 \
+ --seed 1234 \
+ --logging_steps 10 \
+ --module_type 2 \
+ --enable_quant true
diff --git a/examples/training/huggingface/bert/task_qa/run_qa.py b/examples/training/huggingface/bert/task_qa/run_qa.py
new file mode 100644
index 00000000..83c4fe02
--- /dev/null
+++ b/examples/training/huggingface/bert/task_qa/run_qa.py
@@ -0,0 +1,764 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The LightSeq Team
+# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for question answering using a slightly adapted version of the 🤗 Trainer.
+"""
+# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
+
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+import datasets
+from datasets import load_dataset, load_metric
+
+import transformers
+from trainer_qa import QuestionAnsweringTrainer
+from transformers import (
+ AutoConfig,
+ AutoModelForQuestionAnswering,
+ AutoTokenizer,
+ DataCollatorWithPadding,
+ EvalPrediction,
+ HfArgumentParser,
+ PreTrainedTokenizerFast,
+ TrainingArguments,
+ default_data_collator,
+ set_seed,
+)
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
+from utils_qa import postprocess_qa_predictions
+from ls_hf_transformer_layer import inject_ls_layer
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.17.0")
+
+require_version(
+ "datasets>=1.8.0",
+ "To fix: pip install -r examples/pytorch/question-answering/requirements.txt",
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+ """
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+ """
+
+ model_name_or_path: str = field(
+ metadata={
+ "help": "Path to pretrained model or model identifier from huggingface.co/models"
+ }
+ )
+ config_name: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "Pretrained config name or path if not the same as model_name"
+ },
+ )
+ tokenizer_name: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "Pretrained tokenizer name or path if not the same as model_name"
+ },
+ )
+ cache_dir: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "Path to directory to store the pretrained models downloaded from huggingface.co"
+ },
+ )
+ model_revision: str = field(
+ default="main",
+ metadata={
+ "help": "The specific model version to use (can be a branch name, tag name or commit id)."
+ },
+ )
+ use_auth_token: bool = field(
+ default=False,
+ metadata={
+ "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+ "with private models)."
+ },
+ )
+ module_type: int = field(
+ default=1,
+ metadata={
+ "help": "0: original Hugging Face layer, 1: LightSeq CUDA layer, 2: custom Torch layer"
+ },
+ )
+ enable_quant: bool = field(
+ default=False,
+ metadata={"help": "Whether to enable quantization"},
+ )
+
+
+@dataclass
+class DataTrainingArguments:
+ """
+ Arguments pertaining to what data we are going to input our model for training and eval.
+ """
+
+ dataset_name: Optional[str] = field(
+ default=None,
+ metadata={"help": "The name of the dataset to use (via the datasets library)."},
+ )
+ dataset_config_name: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "The configuration name of the dataset to use (via the datasets library)."
+ },
+ )
+ train_file: Optional[str] = field(
+ default=None, metadata={"help": "The input training data file (a text file)."}
+ )
+ validation_file: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."
+ },
+ )
+ test_file: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "An optional input test data file to evaluate the perplexity on (a text file)."
+ },
+ )
+ overwrite_cache: bool = field(
+ default=False,
+ metadata={"help": "Overwrite the cached training and evaluation sets"},
+ )
+ preprocessing_num_workers: Optional[int] = field(
+ default=None,
+ metadata={"help": "The number of processes to use for the preprocessing."},
+ )
+ max_seq_length: int = field(
+ default=384,
+ metadata={
+ "help": "The maximum total input sequence length after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded."
+ },
+ )
+ pad_to_max_length: bool = field(
+ default=True,
+ metadata={
+ "help": "Whether to pad all samples to `max_seq_length`. "
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can "
+ "be faster on GPU but will be slower on TPU)."
+ },
+ )
+ max_train_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+ "value if set."
+ },
+ )
+ max_eval_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+ "value if set."
+ },
+ )
+ max_predict_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+ "value if set."
+ },
+ )
+ version_2_with_negative: bool = field(
+ default=False,
+ metadata={"help": "If true, some of the examples do not have an answer."},
+ )
+ null_score_diff_threshold: float = field(
+ default=0.0,
+ metadata={
+ "help": "The threshold used to select the null answer: if the best answer has a score that is less than "
+ "the score of the null answer minus this threshold, the null answer is selected for this example. "
+ "Only useful when `version_2_with_negative=True`."
+ },
+ )
+ doc_stride: int = field(
+ default=128,
+ metadata={
+ "help": "When splitting up a long document into chunks, how much stride to take between chunks."
+ },
+ )
+ n_best_size: int = field(
+ default=20,
+ metadata={
+ "help": "The total number of n-best predictions to generate when looking for an answer."
+ },
+ )
+ max_answer_length: int = field(
+ default=30,
+ metadata={
+ "help": "The maximum length of an answer that can be generated. This is needed because the start "
+ "and end predictions are not conditioned on one another."
+ },
+ )
+
+ def __post_init__(self):
+ if (
+ self.dataset_name is None
+ and self.train_file is None
+ and self.validation_file is None
+ and self.test_file is None
+ ):
+ raise ValueError(
+ "Need either a dataset name or a training/validation file/test_file."
+ )
+ else:
+ if self.train_file is not None:
+ extension = self.train_file.split(".")[-1]
+ assert extension in [
+ "csv",
+ "json",
+ ], "`train_file` should be a csv or a json file."
+ if self.validation_file is not None:
+ extension = self.validation_file.split(".")[-1]
+ assert extension in [
+ "csv",
+ "json",
+ ], "`validation_file` should be a csv or a json file."
+ if self.test_file is not None:
+ extension = self.test_file.split(".")[-1]
+ assert extension in [
+ "csv",
+ "json",
+ ], "`test_file` should be a csv or a json file."
+
+
+def main():
+ # See all possible arguments in src/transformers/training_args.py
+ # or by passing the --help flag to this script.
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+ parser = HfArgumentParser(
+ (ModelArguments, DataTrainingArguments, TrainingArguments)
+ )
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+ # If we pass only one argument to the script and it's the path to a json file,
+ # let's parse it to get our arguments.
+ model_args, data_args, training_args = parser.parse_json_file(
+ json_file=os.path.abspath(sys.argv[1])
+ )
+ else:
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+ # Setup logging
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ handlers=[logging.StreamHandler(sys.stdout)],
+ )
+
+ log_level = training_args.get_process_log_level()
+ logger.setLevel(log_level)
+ datasets.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.enable_default_handler()
+ transformers.utils.logging.enable_explicit_format()
+
+ # Log on each process the small summary:
+ logger.warning(
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+ )
+ logger.info(f"Training/evaluation parameters {training_args}")
+
+ # Detecting last checkpoint.
+ last_checkpoint = None
+ if (
+ os.path.isdir(training_args.output_dir)
+ and training_args.do_train
+ and not training_args.overwrite_output_dir
+ ):
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+ raise ValueError(
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif (
+ last_checkpoint is not None and training_args.resume_from_checkpoint is None
+ ):
+ logger.info(
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+ )
+
+ # Set seed before initializing model.
+ set_seed(training_args.seed)
+
+ # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+ # (the dataset will be downloaded automatically from the datasets Hub).
+ #
+ # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+ # 'text' is found. You can easily tweak this behavior (see below).
+ #
+ # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+ # download the dataset.
+ if data_args.dataset_name is not None:
+ # Downloading and loading a dataset from the hub.
+ raw_datasets = load_dataset(
+ data_args.dataset_name,
+ data_args.dataset_config_name,
+ cache_dir=model_args.cache_dir,
+ )
+ else:
+ data_files = {}
+ if data_args.train_file is not None:
+ data_files["train"] = data_args.train_file
+ extension = data_args.train_file.split(".")[-1]
+
+ if data_args.validation_file is not None:
+ data_files["validation"] = data_args.validation_file
+ extension = data_args.validation_file.split(".")[-1]
+ if data_args.test_file is not None:
+ data_files["test"] = data_args.test_file
+ extension = data_args.test_file.split(".")[-1]
+ raw_datasets = load_dataset(
+ extension,
+ data_files=data_files,
+ field="data",
+ cache_dir=model_args.cache_dir,
+ )
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+ # Load pretrained model and tokenizer
+ #
+ # Distributed training:
+ # The .from_pretrained methods guarantee that only one local process can concurrently
+ # download model & vocab.
+ config = AutoConfig.from_pretrained(
+ model_args.config_name
+ if model_args.config_name
+ else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ tokenizer = AutoTokenizer.from_pretrained(
+ model_args.tokenizer_name
+ if model_args.tokenizer_name
+ else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ use_fast=True,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ model = AutoModelForQuestionAnswering.from_pretrained(
+ model_args.model_name_or_path,
+ from_tf=bool(".ckpt" in model_args.model_name_or_path),
+ config=config,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+
+ # Replace with LightSeq encoder layers.
+ if model_args.module_type == 1 or model_args.module_type == 2:
+ inject_ls_layer(model, training_args, model_args, config)
+
+ # Tokenizer check: this script requires a fast tokenizer.
+ if not isinstance(tokenizer, PreTrainedTokenizerFast):
+ raise ValueError(
+ "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
+ "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this "
+ "requirement"
+ )
+
+ # Preprocessing the datasets.
+    # Preprocessing is slightly different for training and evaluation.
+ if training_args.do_train:
+ column_names = raw_datasets["train"].column_names
+ elif training_args.do_eval:
+ column_names = raw_datasets["validation"].column_names
+ else:
+ column_names = raw_datasets["test"].column_names
+ question_column_name = "question" if "question" in column_names else column_names[0]
+ context_column_name = "context" if "context" in column_names else column_names[1]
+ answer_column_name = "answers" if "answers" in column_names else column_names[2]
+
+ # Padding side determines if we do (question|context) or (context|question).
+ pad_on_right = tokenizer.padding_side == "right"
+
+ if data_args.max_seq_length > tokenizer.model_max_length:
+ logger.warning(
+ f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+ f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+ )
+ max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+ # Training preprocessing
+ def prepare_train_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
+        # left whitespace.
+ examples[question_column_name] = [
+ q.lstrip() for q in examples[question_column_name]
+ ]
+
+ # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+        # in one example possibly giving several features when a context is long, each of those features having a
+        # context that overlaps a bit with the context of the previous feature.
+ tokenized_examples = tokenizer(
+ examples[question_column_name if pad_on_right else context_column_name],
+ examples[context_column_name if pad_on_right else question_column_name],
+ truncation="only_second" if pad_on_right else "only_first",
+ max_length=max_seq_length,
+ stride=data_args.doc_stride,
+ return_overflowing_tokens=True,
+ return_offsets_mapping=True,
+ padding="max_length" if data_args.pad_to_max_length else False,
+ )
+
+ # Since one example might give us several features if it has a long context, we need a map from a feature to
+ # its corresponding example. This key gives us just that.
+ sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+ # The offset mappings will give us a map from token to character position in the original context. This will
+ # help us compute the start_positions and end_positions.
+ offset_mapping = tokenized_examples.pop("offset_mapping")
+
+ # Let's label those examples!
+ tokenized_examples["start_positions"] = []
+ tokenized_examples["end_positions"] = []
+
+ for i, offsets in enumerate(offset_mapping):
+ # We will label impossible answers with the index of the CLS token.
+ input_ids = tokenized_examples["input_ids"][i]
+ cls_index = input_ids.index(tokenizer.cls_token_id)
+
+ # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+ sequence_ids = tokenized_examples.sequence_ids(i)
+
+ # One example can give several spans, this is the index of the example containing this span of text.
+ sample_index = sample_mapping[i]
+ answers = examples[answer_column_name][sample_index]
+ # If no answers are given, set the cls_index as answer.
+ if len(answers["answer_start"]) == 0:
+ tokenized_examples["start_positions"].append(cls_index)
+ tokenized_examples["end_positions"].append(cls_index)
+ else:
+ # Start/end character index of the answer in the text.
+ start_char = answers["answer_start"][0]
+ end_char = start_char + len(answers["text"][0])
+
+ # Start token index of the current span in the text.
+ token_start_index = 0
+ while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
+ token_start_index += 1
+
+ # End token index of the current span in the text.
+ token_end_index = len(input_ids) - 1
+ while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
+ token_end_index -= 1
+
+ # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
+ if not (
+ offsets[token_start_index][0] <= start_char
+ and offsets[token_end_index][1] >= end_char
+ ):
+ tokenized_examples["start_positions"].append(cls_index)
+ tokenized_examples["end_positions"].append(cls_index)
+ else:
+ # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
+ # Note: we could go after the last offset if the answer is the last word (edge case).
+ while (
+ token_start_index < len(offsets)
+ and offsets[token_start_index][0] <= start_char
+ ):
+ token_start_index += 1
+ tokenized_examples["start_positions"].append(token_start_index - 1)
+ while offsets[token_end_index][1] >= end_char:
+ token_end_index -= 1
+ tokenized_examples["end_positions"].append(token_end_index + 1)
+
+ return tokenized_examples
+
+ if training_args.do_train:
+ if "train" not in raw_datasets:
+ raise ValueError("--do_train requires a train dataset")
+ train_dataset = raw_datasets["train"]
+ if data_args.max_train_samples is not None:
+            # We will select samples from the whole data if the argument is specified
+ train_dataset = train_dataset.select(range(data_args.max_train_samples))
+ # Create train feature from dataset
+ with training_args.main_process_first(desc="train dataset map pre-processing"):
+ train_dataset = train_dataset.map(
+ prepare_train_features,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on train dataset",
+ )
+ if data_args.max_train_samples is not None:
+            # The number of samples might increase during feature creation, so we select only the specified max samples
+ train_dataset = train_dataset.select(range(data_args.max_train_samples))
+
+ # Validation preprocessing
+ def prepare_validation_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
+        # left whitespace.
+ examples[question_column_name] = [
+ q.lstrip() for q in examples[question_column_name]
+ ]
+
+ # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+        # in one example possibly giving several features when a context is long, each of those features having a
+        # context that overlaps a bit with the context of the previous feature.
+ tokenized_examples = tokenizer(
+ examples[question_column_name if pad_on_right else context_column_name],
+ examples[context_column_name if pad_on_right else question_column_name],
+ truncation="only_second" if pad_on_right else "only_first",
+ max_length=max_seq_length,
+ stride=data_args.doc_stride,
+ return_overflowing_tokens=True,
+ return_offsets_mapping=True,
+ padding="max_length" if data_args.pad_to_max_length else False,
+ )
+
+ # Since one example might give us several features if it has a long context, we need a map from a feature to
+ # its corresponding example. This key gives us just that.
+ sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
+ # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
+ # corresponding example_id and we will store the offset mappings.
+ tokenized_examples["example_id"] = []
+
+ for i in range(len(tokenized_examples["input_ids"])):
+ # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+ sequence_ids = tokenized_examples.sequence_ids(i)
+ context_index = 1 if pad_on_right else 0
+
+ # One example can give several spans, this is the index of the example containing this span of text.
+ sample_index = sample_mapping[i]
+ tokenized_examples["example_id"].append(examples["id"][sample_index])
+
+ # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
+ # position is part of the context or not.
+ tokenized_examples["offset_mapping"][i] = [
+ (o if sequence_ids[k] == context_index else None)
+ for k, o in enumerate(tokenized_examples["offset_mapping"][i])
+ ]
+
+ return tokenized_examples
+
+ if training_args.do_eval:
+ if "validation" not in raw_datasets:
+ raise ValueError("--do_eval requires a validation dataset")
+ eval_examples = raw_datasets["validation"]
+ if data_args.max_eval_samples is not None:
+            # We will select samples from the whole data
+ eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+ # Validation Feature Creation
+ with training_args.main_process_first(
+ desc="validation dataset map pre-processing"
+ ):
+ eval_dataset = eval_examples.map(
+ prepare_validation_features,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on validation dataset",
+ )
+ if data_args.max_eval_samples is not None:
+            # During feature creation the number of samples might increase, so we select the required samples again
+ eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+
+ if training_args.do_predict:
+ if "test" not in raw_datasets:
+ raise ValueError("--do_predict requires a test dataset")
+ predict_examples = raw_datasets["test"]
+ if data_args.max_predict_samples is not None:
+            # We will select samples from the whole data
+ predict_examples = predict_examples.select(
+ range(data_args.max_predict_samples)
+ )
+ # Predict Feature Creation
+ with training_args.main_process_first(
+ desc="prediction dataset map pre-processing"
+ ):
+ predict_dataset = predict_examples.map(
+ prepare_validation_features,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on prediction dataset",
+ )
+ if data_args.max_predict_samples is not None:
+            # During feature creation the number of samples might increase, so we select the required samples again
+ predict_dataset = predict_dataset.select(
+ range(data_args.max_predict_samples)
+ )
+
+ # Data collator
+ # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
+ # collator.
+ data_collator = (
+ default_data_collator
+ if data_args.pad_to_max_length
+ else DataCollatorWithPadding(
+ tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None
+ )
+ )
+
+ # Post-processing:
+ def post_processing_function(examples, features, predictions, stage="eval"):
+ # Post-processing: we match the start logits and end logits to answers in the original context.
+ predictions = postprocess_qa_predictions(
+ examples=examples,
+ features=features,
+ predictions=predictions,
+ version_2_with_negative=data_args.version_2_with_negative,
+ n_best_size=data_args.n_best_size,
+ max_answer_length=data_args.max_answer_length,
+ null_score_diff_threshold=data_args.null_score_diff_threshold,
+ output_dir=training_args.output_dir,
+ log_level=log_level,
+ prefix=stage,
+ )
+ # Format the result to the format the metric expects.
+ if data_args.version_2_with_negative:
+ formatted_predictions = [
+ {"id": k, "prediction_text": v, "no_answer_probability": 0.0}
+ for k, v in predictions.items()
+ ]
+ else:
+ formatted_predictions = [
+ {"id": k, "prediction_text": v} for k, v in predictions.items()
+ ]
+
+ references = [
+ {"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples
+ ]
+ return EvalPrediction(predictions=formatted_predictions, label_ids=references)
+
+ metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
+
+ def compute_metrics(p: EvalPrediction):
+ return metric.compute(predictions=p.predictions, references=p.label_ids)
+
+ # Initialize our Trainer
+ trainer = QuestionAnsweringTrainer(
+ model=model,
+ args=training_args,
+ train_dataset=train_dataset if training_args.do_train else None,
+ eval_dataset=eval_dataset if training_args.do_eval else None,
+ eval_examples=eval_examples if training_args.do_eval else None,
+ tokenizer=tokenizer,
+ data_collator=data_collator,
+ post_process_function=post_processing_function,
+ compute_metrics=compute_metrics,
+ )
+
+ # Training
+ if training_args.do_train:
+ checkpoint = None
+ if training_args.resume_from_checkpoint is not None:
+ checkpoint = training_args.resume_from_checkpoint
+ elif last_checkpoint is not None:
+ checkpoint = last_checkpoint
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
+ trainer.save_model() # Saves the tokenizer too for easy upload
+
+ metrics = train_result.metrics
+ max_train_samples = (
+ data_args.max_train_samples
+ if data_args.max_train_samples is not None
+ else len(train_dataset)
+ )
+ metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+ trainer.log_metrics("train", metrics)
+ trainer.save_metrics("train", metrics)
+ trainer.save_state()
+
+ # Evaluation
+ if training_args.do_eval:
+ logger.info("*** Evaluate ***")
+ metrics = trainer.evaluate()
+
+ max_eval_samples = (
+ data_args.max_eval_samples
+ if data_args.max_eval_samples is not None
+ else len(eval_dataset)
+ )
+ metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+ trainer.log_metrics("eval", metrics)
+ trainer.save_metrics("eval", metrics)
+
+ # Prediction
+ if training_args.do_predict:
+ logger.info("*** Predict ***")
+ results = trainer.predict(predict_dataset, predict_examples)
+ metrics = results.metrics
+
+ max_predict_samples = (
+ data_args.max_predict_samples
+ if data_args.max_predict_samples is not None
+ else len(predict_dataset)
+ )
+ metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
+
+ trainer.log_metrics("predict", metrics)
+ trainer.save_metrics("predict", metrics)
+
+ kwargs = {
+ "finetuned_from": model_args.model_name_or_path,
+ "tasks": "question-answering",
+ }
+ if data_args.dataset_name is not None:
+ kwargs["dataset_tags"] = data_args.dataset_name
+ if data_args.dataset_config_name is not None:
+ kwargs["dataset_args"] = data_args.dataset_config_name
+ kwargs[
+ "dataset"
+ ] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+ else:
+ kwargs["dataset"] = data_args.dataset_name
+
+ if training_args.push_to_hub:
+ trainer.push_to_hub(**kwargs)
+ else:
+ trainer.create_model_card(**kwargs)
+
+
+def _mp_fn(index):
+ # For xla_spawn (TPUs)
+ main()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/training/huggingface/bert/task_qa/run_qa.sh b/examples/training/huggingface/bert/task_qa/run_qa.sh
new file mode 100644
index 00000000..61346d8d
--- /dev/null
+++ b/examples/training/huggingface/bert/task_qa/run_qa.sh
@@ -0,0 +1,35 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THIS_DIR=$(dirname $(readlink -f $0))
+
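+# Fine-tune bert-base-uncased on SQuAD with the LightSeq CUDA encoder layer (module_type 1); quantization disabled.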
+python3 -m torch.distributed.launch \
+ --nproc_per_node=1 \
+ $THIS_DIR/run_qa.py \
+ --model_name_or_path bert-base-uncased \
+ --dataset_name squad \
+ --do_train \
+ --do_eval \
+ --max_seq_length 256 \
+ --per_device_train_batch_size 16 \
+ --doc_stride 128 \
+ --learning_rate 3e-5 \
+ --num_train_epochs 10 \
+ --output_dir /tmp/squad \
+ --overwrite_output_dir \
+ --fp16 \
+ --seed 1234 \
+ --logging_steps 10 \
+ --module_type 1 \
+ --enable_quant false
diff --git a/examples/training/huggingface/bert/task_qa/trainer_qa.py b/examples/training/huggingface/bert/task_qa/trainer_qa.py
new file mode 100644
index 00000000..c3c2ba01
--- /dev/null
+++ b/examples/training/huggingface/bert/task_qa/trainer_qa.py
@@ -0,0 +1,135 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A subclass of `Trainer` specific to Question-Answering tasks
+"""
+
+from transformers import Trainer, is_torch_tpu_available
+from transformers.trainer_utils import PredictionOutput
+
+
+if is_torch_tpu_available():
+ import torch_xla.core.xla_model as xm
+ import torch_xla.debug.metrics as met
+
+
+class QuestionAnsweringTrainer(Trainer):
+ def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.eval_examples = eval_examples
+ self.post_process_function = post_process_function
+
+ def evaluate(
+ self,
+ eval_dataset=None,
+ eval_examples=None,
+ ignore_keys=None,
+ metric_key_prefix: str = "eval",
+ ):
+ eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
+ eval_dataloader = self.get_eval_dataloader(eval_dataset)
+ eval_examples = self.eval_examples if eval_examples is None else eval_examples
+
+ # Temporarily disable metric computation, we will do it in the loop here.
+ compute_metrics = self.compute_metrics
+ self.compute_metrics = None
+ eval_loop = (
+ self.prediction_loop
+ if self.args.use_legacy_prediction_loop
+ else self.evaluation_loop
+ )
+ try:
+ output = eval_loop(
+ eval_dataloader,
+ description="Evaluation",
+ # No point gathering the predictions if there are no metrics, otherwise we defer to
+ # self.args.prediction_loss_only
+ prediction_loss_only=True if compute_metrics is None else None,
+ ignore_keys=ignore_keys,
+ )
+ finally:
+ self.compute_metrics = compute_metrics
+
+ if self.post_process_function is not None and self.compute_metrics is not None:
+ eval_preds = self.post_process_function(
+ eval_examples, eval_dataset, output.predictions
+ )
+ metrics = self.compute_metrics(eval_preds)
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ self.log(metrics)
+ else:
+ metrics = {}
+
+ if self.args.tpu_metrics_debug or self.args.debug:
+ # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
+ xm.master_print(met.metrics_report())
+
+ self.control = self.callback_handler.on_evaluate(
+ self.args, self.state, self.control, metrics
+ )
+ return metrics
+
+ def predict(
+ self,
+ predict_dataset,
+ predict_examples,
+ ignore_keys=None,
+ metric_key_prefix: str = "test",
+ ):
+ predict_dataloader = self.get_test_dataloader(predict_dataset)
+
+ # Temporarily disable metric computation, we will do it in the loop here.
+ compute_metrics = self.compute_metrics
+ self.compute_metrics = None
+ eval_loop = (
+ self.prediction_loop
+ if self.args.use_legacy_prediction_loop
+ else self.evaluation_loop
+ )
+ try:
+ output = eval_loop(
+ predict_dataloader,
+ description="Prediction",
+ # No point gathering the predictions if there are no metrics, otherwise we defer to
+ # self.args.prediction_loss_only
+ prediction_loss_only=True if compute_metrics is None else None,
+ ignore_keys=ignore_keys,
+ )
+ finally:
+ self.compute_metrics = compute_metrics
+
+ if self.post_process_function is None or self.compute_metrics is None:
+ return output
+
+ predictions = self.post_process_function(
+ predict_examples, predict_dataset, output.predictions, "predict"
+ )
+ metrics = self.compute_metrics(predictions)
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ return PredictionOutput(
+ predictions=predictions.predictions,
+ label_ids=predictions.label_ids,
+ metrics=metrics,
+ )
diff --git a/examples/training/huggingface/bert/task_qa/utils_qa.py b/examples/training/huggingface/bert/task_qa/utils_qa.py
new file mode 100644
index 00000000..c1c5c10b
--- /dev/null
+++ b/examples/training/huggingface/bert/task_qa/utils_qa.py
@@ -0,0 +1,520 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Post-processing utilities for question answering.
+"""
+import collections
+import json
+import logging
+import os
+from typing import Optional, Tuple
+
+import numpy as np
+from tqdm.auto import tqdm
+
+
+logger = logging.getLogger(__name__)
+
+
+def postprocess_qa_predictions(
+ examples,
+ features,
+ predictions: Tuple[np.ndarray, np.ndarray],
+ version_2_with_negative: bool = False,
+ n_best_size: int = 20,
+ max_answer_length: int = 30,
+ null_score_diff_threshold: float = 0.0,
+ output_dir: Optional[str] = None,
+ prefix: Optional[str] = None,
+ log_level: Optional[int] = logging.WARNING,
+):
+ """
+ Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
+    original contexts. This is the base postprocessing function for models that only return start and end logits.
+
+ Args:
+ examples: The non-preprocessed dataset (see the main script for more information).
+ features: The processed dataset (see the main script for more information).
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+ The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
+ first dimension must match the number of elements of :obj:`features`.
+ version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the underlying dataset contains examples with no answers.
+ n_best_size (:obj:`int`, `optional`, defaults to 20):
+ The total number of n-best predictions to generate when looking for an answer.
+ max_answer_length (:obj:`int`, `optional`, defaults to 30):
+ The maximum length of an answer that can be generated. This is needed because the start and end predictions
+ are not conditioned on one another.
+ null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
+ The threshold used to select the null answer: if the best answer has a score that is less than the score of
+ the null answer minus this threshold, the null answer is selected for this example (note that the score of
+ the null answer for an example giving several features is the minimum of the scores for the null answer on
+ each feature: all features must be aligned on the fact they `want` to predict a null answer).
+
+ Only useful when :obj:`version_2_with_negative` is :obj:`True`.
+ output_dir (:obj:`str`, `optional`):
+ If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
+ :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
+ answers, are saved in `output_dir`.
+ prefix (:obj:`str`, `optional`):
+ If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
+ log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+ ``logging`` log level (e.g., ``logging.WARNING``)
+ """
+ if len(predictions) != 2:
+ raise ValueError(
+ "`predictions` should be a tuple with two elements (start_logits, end_logits)."
+ )
+ all_start_logits, all_end_logits = predictions
+
+ if len(predictions[0]) != len(features):
+ raise ValueError(
+ f"Got {len(predictions[0])} predictions and {len(features)} features."
+ )
+
+ # Build a map example to its corresponding features.
+ example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+ features_per_example = collections.defaultdict(list)
+ for i, feature in enumerate(features):
+ features_per_example[example_id_to_index[feature["example_id"]]].append(i)
+
+ # The dictionaries we have to fill.
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ if version_2_with_negative:
+ scores_diff_json = collections.OrderedDict()
+
+ # Logging.
+ logger.setLevel(log_level)
+ logger.info(
+ f"Post-processing {len(examples)} example predictions split into {len(features)} features."
+ )
+
+ # Let's loop over all the examples!
+ for example_index, example in enumerate(tqdm(examples)):
+ # Those are the indices of the features associated to the current example.
+ feature_indices = features_per_example[example_index]
+
+ min_null_prediction = None
+ prelim_predictions = []
+
+ # Looping through all the features associated to the current example.
+ for feature_index in feature_indices:
+ # We grab the predictions of the model for this feature.
+ start_logits = all_start_logits[feature_index]
+ end_logits = all_end_logits[feature_index]
+            # This is what will allow us to map some of the positions in our logits to spans of text in the original
+ # context.
+ offset_mapping = features[feature_index]["offset_mapping"]
+ # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
+ # available in the current feature.
+ token_is_max_context = features[feature_index].get(
+ "token_is_max_context", None
+ )
+
+ # Update minimum null prediction.
+ feature_null_score = start_logits[0] + end_logits[0]
+ if (
+ min_null_prediction is None
+ or min_null_prediction["score"] > feature_null_score
+ ):
+ min_null_prediction = {
+ "offsets": (0, 0),
+ "score": feature_null_score,
+ "start_logit": start_logits[0],
+ "end_logit": end_logits[0],
+ }
+
+ # Go through all possibilities for the `n_best_size` greater start and end logits.
+ start_indexes = np.argsort(start_logits)[
+ -1 : -n_best_size - 1 : -1
+ ].tolist()
+ end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
+ for start_index in start_indexes:
+ for end_index in end_indexes:
+ # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
+ # to part of the input_ids that are not in the context.
+ if (
+ start_index >= len(offset_mapping)
+ or end_index >= len(offset_mapping)
+ or offset_mapping[start_index] is None
+ or len(offset_mapping[start_index]) < 2
+ or offset_mapping[end_index] is None
+ or len(offset_mapping[end_index]) < 2
+ ):
+ continue
+ # Don't consider answers with a length that is either < 0 or > max_answer_length.
+ if (
+ end_index < start_index
+ or end_index - start_index + 1 > max_answer_length
+ ):
+ continue
+                    # Don't consider answers that don't have the maximum context available (if such information is
+ # provided).
+ if (
+ token_is_max_context is not None
+ and not token_is_max_context.get(str(start_index), False)
+ ):
+ continue
+
+ prelim_predictions.append(
+ {
+ "offsets": (
+ offset_mapping[start_index][0],
+ offset_mapping[end_index][1],
+ ),
+ "score": start_logits[start_index] + end_logits[end_index],
+ "start_logit": start_logits[start_index],
+ "end_logit": end_logits[end_index],
+ }
+ )
+ if version_2_with_negative:
+ # Add the minimum null prediction
+ prelim_predictions.append(min_null_prediction)
+ null_score = min_null_prediction["score"]
+
+ # Only keep the best `n_best_size` predictions.
+ predictions = sorted(
+ prelim_predictions, key=lambda x: x["score"], reverse=True
+ )[:n_best_size]
+
+ # Add back the minimum null prediction if it was removed because of its low score.
+ if version_2_with_negative and not any(
+ p["offsets"] == (0, 0) for p in predictions
+ ):
+ predictions.append(min_null_prediction)
+
+ # Use the offsets to gather the answer text in the original context.
+ context = example["context"]
+ for pred in predictions:
+ offsets = pred.pop("offsets")
+ pred["text"] = context[offsets[0] : offsets[1]]
+
+        # In the very rare edge case where we do not have a single non-null prediction, we create a fake prediction to avoid
+ # failure.
+ if len(predictions) == 0 or (
+ len(predictions) == 1 and predictions[0]["text"] == ""
+ ):
+ predictions.insert(
+ 0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}
+ )
+
+ # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+ # the LogSumExp trick).
+ scores = np.array([pred.pop("score") for pred in predictions])
+ exp_scores = np.exp(scores - np.max(scores))
+ probs = exp_scores / exp_scores.sum()
+
+ # Include the probabilities in our predictions.
+ for prob, pred in zip(probs, predictions):
+ pred["probability"] = prob
+
+ # Pick the best prediction. If the null answer is not possible, this is easy.
+ if not version_2_with_negative:
+ all_predictions[example["id"]] = predictions[0]["text"]
+ else:
+ # Otherwise we first need to find the best non-empty prediction.
+ i = 0
+ while predictions[i]["text"] == "":
+ i += 1
+ best_non_null_pred = predictions[i]
+
+ # Then we compare to the null prediction using the threshold.
+ score_diff = (
+ null_score
+ - best_non_null_pred["start_logit"]
+ - best_non_null_pred["end_logit"]
+ )
+ scores_diff_json[example["id"]] = float(
+ score_diff
+ ) # To be JSON-serializable.
+ if score_diff > null_score_diff_threshold:
+ all_predictions[example["id"]] = ""
+ else:
+ all_predictions[example["id"]] = best_non_null_pred["text"]
+
+ # Make `predictions` JSON-serializable by casting np.float back to float.
+ all_nbest_json[example["id"]] = [
+ {
+ k: (
+ float(v)
+ if isinstance(v, (np.float16, np.float32, np.float64))
+ else v
+ )
+ for k, v in pred.items()
+ }
+ for pred in predictions
+ ]
+
+ # If we have an output_dir, let's save all those dicts.
+ if output_dir is not None:
+ if not os.path.isdir(output_dir):
+ raise EnvironmentError(f"{output_dir} is not a directory.")
+
+ prediction_file = os.path.join(
+ output_dir,
+ "predictions.json" if prefix is None else f"{prefix}_predictions.json",
+ )
+ nbest_file = os.path.join(
+ output_dir,
+ "nbest_predictions.json"
+ if prefix is None
+ else f"{prefix}_nbest_predictions.json",
+ )
+ if version_2_with_negative:
+ null_odds_file = os.path.join(
+ output_dir,
+ "null_odds.json" if prefix is None else f"{prefix}_null_odds.json",
+ )
+
+ logger.info(f"Saving predictions to {prediction_file}.")
+ with open(prediction_file, "w") as writer:
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
+ logger.info(f"Saving nbest_preds to {nbest_file}.")
+ with open(nbest_file, "w") as writer:
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+ if version_2_with_negative:
+ logger.info(f"Saving null_odds to {null_odds_file}.")
+ with open(null_odds_file, "w") as writer:
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+ return all_predictions
+
+
+def postprocess_qa_predictions_with_beam_search(
+ examples,
+ features,
+ predictions: Tuple[np.ndarray, np.ndarray],
+ version_2_with_negative: bool = False,
+ n_best_size: int = 20,
+ max_answer_length: int = 30,
+ start_n_top: int = 5,
+ end_n_top: int = 5,
+ output_dir: Optional[str] = None,
+ prefix: Optional[str] = None,
+ log_level: Optional[int] = logging.WARNING,
+):
+ """
+ Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
+    original contexts. This is the postprocessing function for models that return start and end logits, indices, as well as
+ cls token predictions.
+
+ Args:
+ examples: The non-preprocessed dataset (see the main script for more information).
+ features: The processed dataset (see the main script for more information).
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+            The predictions of the model: five arrays containing the start top log probabilities, the start top
+            indices, the end top log probabilities, the end top indices and the cls logits. Their first dimension
+            must match the number of elements of :obj:`features`.
+ version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the underlying dataset contains examples with no answers.
+ n_best_size (:obj:`int`, `optional`, defaults to 20):
+ The total number of n-best predictions to generate when looking for an answer.
+ max_answer_length (:obj:`int`, `optional`, defaults to 30):
+ The maximum length of an answer that can be generated. This is needed because the start and end predictions
+ are not conditioned on one another.
+ start_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top start logits to keep when searching for the :obj:`n_best_size` predictions.
+        end_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top end logits to keep when searching for the :obj:`n_best_size` predictions.
+ output_dir (:obj:`str`, `optional`):
+ If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
+ :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
+ answers, are saved in `output_dir`.
+ prefix (:obj:`str`, `optional`):
+ If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
+ log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+ ``logging`` log level (e.g., ``logging.WARNING``)
+ """
+ if len(predictions) != 5:
+ raise ValueError("`predictions` should be a tuple with five elements.")
+ (
+ start_top_log_probs,
+ start_top_index,
+ end_top_log_probs,
+ end_top_index,
+ cls_logits,
+ ) = predictions
+
+ if len(predictions[0]) != len(features):
+ raise ValueError(
+ f"Got {len(predictions[0])} predictions and {len(features)} features."
+ )
+
+ # Build a map example to its corresponding features.
+ example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+ features_per_example = collections.defaultdict(list)
+ for i, feature in enumerate(features):
+ features_per_example[example_id_to_index[feature["example_id"]]].append(i)
+
+ # The dictionaries we have to fill.
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ scores_diff_json = collections.OrderedDict() if version_2_with_negative else None
+
+ # Logging.
+ logger.setLevel(log_level)
+ logger.info(
+ f"Post-processing {len(examples)} example predictions split into {len(features)} features."
+ )
+
+ # Let's loop over all the examples!
+ for example_index, example in enumerate(tqdm(examples)):
+ # Those are the indices of the features associated to the current example.
+ feature_indices = features_per_example[example_index]
+
+ min_null_score = None
+ prelim_predictions = []
+
+ # Looping through all the features associated to the current example.
+ for feature_index in feature_indices:
+ # We grab the predictions of the model for this feature.
+ start_log_prob = start_top_log_probs[feature_index]
+ start_indexes = start_top_index[feature_index]
+ end_log_prob = end_top_log_probs[feature_index]
+ end_indexes = end_top_index[feature_index]
+ feature_null_score = cls_logits[feature_index]
+            # This is what will allow us to map some of the positions in our logits to spans of text in the original
+            # context.
+ offset_mapping = features[feature_index]["offset_mapping"]
+ # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
+ # available in the current feature.
+ token_is_max_context = features[feature_index].get(
+ "token_is_max_context", None
+ )
+
+ # Update minimum null prediction
+ if min_null_score is None or feature_null_score < min_null_score:
+ min_null_score = feature_null_score
+
+            # Go through all combinations of the top `start_n_top` start logits and top `end_n_top` end logits.
+ for i in range(start_n_top):
+ for j in range(end_n_top):
+ start_index = int(start_indexes[i])
+ j_index = i * end_n_top + j
+ end_index = int(end_indexes[j_index])
+ # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the
+ # p_mask but let's not take any risk)
+ if (
+ start_index >= len(offset_mapping)
+ or end_index >= len(offset_mapping)
+ or offset_mapping[start_index] is None
+ or offset_mapping[end_index] is None
+ ):
+ continue
+                    # Don't consider answers with a negative length or a length greater than max_answer_length.
+ if (
+ end_index < start_index
+ or end_index - start_index + 1 > max_answer_length
+ ):
+ continue
+                    # Don't consider answers that don't have the maximum context available (if such information is
+ # provided).
+ if (
+ token_is_max_context is not None
+ and not token_is_max_context.get(str(start_index), False)
+ ):
+ continue
+ prelim_predictions.append(
+ {
+ "offsets": (
+ offset_mapping[start_index][0],
+ offset_mapping[end_index][1],
+ ),
+ "score": start_log_prob[i] + end_log_prob[j_index],
+ "start_log_prob": start_log_prob[i],
+ "end_log_prob": end_log_prob[j_index],
+ }
+ )
+
+ # Only keep the best `n_best_size` predictions.
+ predictions = sorted(
+ prelim_predictions, key=lambda x: x["score"], reverse=True
+ )[:n_best_size]
+
+ # Use the offsets to gather the answer text in the original context.
+ context = example["context"]
+ for pred in predictions:
+ offsets = pred.pop("offsets")
+ pred["text"] = context[offsets[0] : offsets[1]]
+
+ # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
+ # failure.
+ if len(predictions) == 0:
+ predictions.insert(
+ 0,
+ {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6},
+ )
+
+ # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+ # the LogSumExp trick).
+ scores = np.array([pred.pop("score") for pred in predictions])
+ exp_scores = np.exp(scores - np.max(scores))
+ probs = exp_scores / exp_scores.sum()
+
+ # Include the probabilities in our predictions.
+ for prob, pred in zip(probs, predictions):
+ pred["probability"] = prob
+
+ # Pick the best prediction and set the probability for the null answer.
+ all_predictions[example["id"]] = predictions[0]["text"]
+ if version_2_with_negative:
+ scores_diff_json[example["id"]] = float(min_null_score)
+
+ # Make `predictions` JSON-serializable by casting np.float back to float.
+ all_nbest_json[example["id"]] = [
+ {
+ k: (
+ float(v)
+ if isinstance(v, (np.float16, np.float32, np.float64))
+ else v
+ )
+ for k, v in pred.items()
+ }
+ for pred in predictions
+ ]
+
+ # If we have an output_dir, let's save all those dicts.
+ if output_dir is not None:
+ if not os.path.isdir(output_dir):
+ raise EnvironmentError(f"{output_dir} is not a directory.")
+
+ prediction_file = os.path.join(
+ output_dir,
+ "predictions.json" if prefix is None else f"{prefix}_predictions.json",
+ )
+ nbest_file = os.path.join(
+ output_dir,
+ "nbest_predictions.json"
+ if prefix is None
+ else f"{prefix}_nbest_predictions.json",
+ )
+ if version_2_with_negative:
+ null_odds_file = os.path.join(
+ output_dir,
+ "null_odds.json" if prefix is None else f"{prefix}_null_odds.json",
+ )
+
+ logger.info(f"Saving predictions to {prediction_file}.")
+ with open(prediction_file, "w") as writer:
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
+ logger.info(f"Saving nbest_preds to {nbest_file}.")
+ with open(nbest_file, "w") as writer:
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+ if version_2_with_negative:
+ logger.info(f"Saving null_odds to {null_odds_file}.")
+ with open(null_odds_file, "w") as writer:
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+ return all_predictions, scores_diff_json
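As a side note on the probability step inside the loop above (subtract the max score, exponentiate, then normalize), here is a small self-contained NumPy sketch of that numerically stable softmax; the helper name `stable_softmax` and the sample scores are purely illustrative:

```python
import numpy as np

def stable_softmax(scores: np.ndarray) -> np.ndarray:
    # Subtracting the max leaves the softmax unchanged but prevents exp() overflow.
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / exp_scores.sum()

scores = np.array([12.3, 11.7, 9.0, -4.2])
probs = stable_softmax(scores)
print(probs, probs.sum())  # per-candidate probabilities, summing to 1.0
```

This mirrors how the n-best candidate scores are turned into the `probability` field that ends up in `nbest_predictions.json`.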
diff --git a/examples/training/huggingface/gpt/README.md b/examples/training/huggingface/gpt/README.md
index 76bfc84a..fe80f415 100644
--- a/examples/training/huggingface/gpt/README.md
+++ b/examples/training/huggingface/gpt/README.md
@@ -7,8 +7,8 @@ We modify the language modeling [examples](https://github.com/huggingface/transf
First you should install these requirements.
```shell
-pip install -r requirements.txt
-bash run_clm.sh
+$ pip install -r requirements.txt
+$ bash run_clm.sh
```
Before running the script, make sure your PyTorch works fine with CUDA; LightSeq doesn't support PyTorch CPU mode. You can verify that PyTorch sees your CUDA device with the following code.
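The verification snippet itself is outside this diff excerpt; as a minimal sketch (assuming only a standard PyTorch install), a check like the following is enough:

```python
import torch

# LightSeq training requires a CUDA-enabled PyTorch build; fail fast otherwise.
assert torch.cuda.is_available(), "PyTorch cannot see a CUDA device"
print(torch.__version__, torch.version.cuda, torch.cuda.get_device_name(0))
```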
diff --git a/examples/training/huggingface/gpt/__init__.py b/examples/training/huggingface/gpt/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/training/huggingface/gpt/ls_hf_gpt_layer.py b/examples/training/huggingface/gpt/ls_hf_gpt_layer.py
new file mode 100644
index 00000000..90061766
--- /dev/null
+++ b/examples/training/huggingface/gpt/ls_hf_gpt_layer.py
@@ -0,0 +1,129 @@
+import torch
+from torch import nn
+
+from lightseq.training.ops.pytorch.quantization import (
+ qat_mode,
+ disable_quant,
+ QuantLinear,
+ TensorQuantizer,
+ weight_quant_config,
+)
+from lightseq.training.ops.pytorch.torch_transformer_layers import (
+ TransformerDecoderLayer,
+ copy_para,
+)
+
+
+def get_hf_gpt_enc_layer_params(layer, config):
+ init_ws = []
+ init_bs = []
+
+ init_ws.extend(
+ layer.attn.c_attn.weight.detach().clone().t().split(config.hidden_size, 0)
+ )
+ init_bs.extend(layer.attn.c_attn.bias.detach().clone().split(config.hidden_size, 0))
+
+ init_ws.append(layer.attn.c_proj.weight.detach().clone().t().reshape(-1))
+ init_bs.append(layer.attn.c_proj.bias.detach().clone())
+ init_ws.append(layer.ln_1.weight.detach().clone())
+ init_bs.append(layer.ln_1.bias.detach().clone())
+
+ init_ws.append(layer.mlp.c_fc.weight.detach().clone().t().reshape(-1))
+ init_bs.append(layer.mlp.c_fc.bias.detach().clone())
+ init_ws.append(layer.mlp.c_proj.weight.detach().clone().t().reshape(-1))
+ init_bs.append(layer.mlp.c_proj.bias.detach().clone())
+ init_ws.append(layer.ln_2.weight.detach().clone())
+ init_bs.append(layer.ln_2.bias.detach().clone())
+
+ return init_ws, init_bs
+
+
+def get_hf_gpt_emb_layer_params(layer):
+ init_ws = []
+
+ init_ws.append(layer.wte.weight.detach().clone())
+ init_ws.append(layer.wpe.weight.detach().clone())
+
+ return init_ws
+
+
+def gen_gpt_enc_config(training_args, config):
+ gpt_enc_config = TransformerDecoderLayer.get_config(
+ max_batch_tokens=8192,
+ max_seq_len=config.max_position_embeddings,
+ hidden_size=config.hidden_size,
+ intermediate_size=4 * config.hidden_size,
+ nhead=config.num_attention_heads,
+ attn_prob_dropout_ratio=config.attn_pdrop,
+ activation_dropout_ratio=config.resid_pdrop,
+ hidden_dropout_ratio=config.resid_pdrop,
+ pre_layer_norm=True,
+ fp16=training_args.fp16,
+ local_rank=training_args.local_rank,
+ nlayer=config.num_hidden_layers,
+ activation_fn="gelu",
+ has_cross_attn=False,
+ )
+ return gpt_enc_config
+
+
+class LSHFGptEncoderLayer(TransformerDecoderLayer):
+ def __init__(self, *args, **kwargs):
+ super(LSHFGptEncoderLayer, self).__init__(*args, **kwargs)
+
+    def forward(self, hidden_states, attention_mask=None, *args, **kwargs):
+        # Reduce the incoming attention mask to (batch_size, seq_len) for the LightSeq
+        # layer; if no mask is given, fall back to an all-zero mask (nothing is masked).
+        if attention_mask is not None:
+            ls_attention_mask = attention_mask.squeeze()
+        else:
+            ls_attention_mask = torch.zeros(hidden_states.size()[:2])
+ output = super().forward(hidden_states, ls_attention_mask)
+ return output
+
+
+class GptEmbedding(nn.Embedding):
+ def __init__(self, training_args, initial_embeddings=None, *args, **kwargs):
+ super(GptEmbedding, self).__init__(*args, **kwargs)
+ self.emb_quant = TensorQuantizer(weight_quant_config)
+
+ if initial_embeddings is not None:
+ self.weight.data.copy_(copy_para(initial_embeddings, training_args.fp16))
+
+ def forward(self, input_ids):
+ x = super(GptEmbedding, self).forward(input_ids)
+ x = self.emb_quant(x)
+ return x
+
+
+def inject_ls_layer(model, training_args, model_args, config):
+ if model_args.module_type == 1:
+ from lightseq.training import ls_hf_gpt_enc_convert
+
+ ls_hf_gpt_enc_convert(model, training_args, config)
+ return
+
+ if model_args.module_type != 2:
+ raise NotImplementedError
+
+ init_ws = get_hf_gpt_emb_layer_params(model.transformer)
+ model.transformer.wte = GptEmbedding(
+ training_args, init_ws[0], config.vocab_size, config.hidden_size
+ )
+ if model_args.enable_quant:
+ model.transformer.wte.apply(qat_mode)
+ else:
+ model.transformer.wte.apply(disable_quant)
+
+ for i in range(config.num_hidden_layers):
+ gpt_enc_config = gen_gpt_enc_config(training_args, config)
+ init_ws, init_bs = get_hf_gpt_enc_layer_params(model.transformer.h[i], config)
+ model.transformer.h[i] = LSHFGptEncoderLayer(
+ gpt_enc_config, init_ws, init_bs
+ ).cuda()
+ if model_args.enable_quant:
+ model.transformer.h[i].apply(qat_mode)
+ else:
+ model.transformer.h[i].apply(disable_quant)
+
+ q_lm_head = QuantLinear(config.n_embd, config.vocab_size, bias=False)
+ q_lm_head.weight = model.transformer.wte.weight
+ q_lm_head.weight_quant = model.transformer.wte.emb_quant
+ model.lm_head = q_lm_head
diff --git a/examples/training/huggingface/gpt/run_clm.py b/examples/training/huggingface/gpt/run_clm.py
index 90b9dd8d..52dfc223 100644
--- a/examples/training/huggingface/gpt/run_clm.py
+++ b/examples/training/huggingface/gpt/run_clm.py
@@ -33,6 +33,7 @@
import datasets
from datasets import load_dataset
+import torch
import transformers
from transformers import (
CONFIG_MAPPING,
@@ -50,8 +51,7 @@
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
-
-from lightseq.training import ls_hf_gpt_convert
+from ls_hf_gpt_layer import inject_ls_layer
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -133,9 +133,15 @@ class ModelArguments:
"with private models)."
},
)
- with_lightseq: bool = field(
- default=True,
- metadata={"help": "Whether to use lightseq"},
+ module_type: int = field(
+ default=1,
+ metadata={
+ "help": "0: original Hugging Face layer, 1: LightSeq CUDA layer, 2: custom Torch layer"
+ },
+ )
+ enable_quant: bool = field(
+ default=False,
+ metadata={"help": "Whether to enable quantization"},
)
def __post_init__(self):
@@ -436,8 +442,8 @@ def main():
)
# Replace with LightSeq encoder layers.
- if model_args.with_lightseq:
- ls_hf_gpt_convert(model, training_args, config)
+ if model_args.module_type == 1 or model_args.module_type == 2:
+ inject_ls_layer(model, training_args, model_args, config)
model.resize_token_embeddings(len(tokenizer))
@@ -548,6 +554,12 @@ def group_texts(examples):
data_collator=default_data_collator,
)
+ if not training_args.do_train:
+ state_dict = torch.load(
+ training_args.resume_from_checkpoint, map_location="cpu"
+ )
+ trainer._load_state_dict_in_model(state_dict)
+
# Training
if training_args.do_train:
checkpoint = None
diff --git a/examples/training/huggingface/gpt/run_clm.sh b/examples/training/huggingface/gpt/run_clm.sh
index 863a8b97..30449bc4 100644
--- a/examples/training/huggingface/gpt/run_clm.sh
+++ b/examples/training/huggingface/gpt/run_clm.sh
@@ -8,12 +8,15 @@ python3 -m torch.distributed.launch \
--model_name_or_path gpt2 \
--dataset_name wikitext \
--dataset_config_name wikitext-103-raw-v1 \
- --per_device_train_batch_size 8 \
+ --per_device_train_batch_size 16 \
--per_device_eval_batch_size 8 \
+ --num_train_epochs 1 \
--do_train \
--do_eval \
--output_dir /tmp/test-clm \
--overwrite_output_dir \
--fp16 \
--logging_steps 10 \
- --block_size 512
+ --block_size 512 \
+ --module_type 2 \
+ --enable_quant false
diff --git a/examples/training/huggingface/gpt/run_quant_clm.sh b/examples/training/huggingface/gpt/run_quant_clm.sh
new file mode 100644
index 00000000..196e6434
--- /dev/null
+++ b/examples/training/huggingface/gpt/run_quant_clm.sh
@@ -0,0 +1,23 @@
+#! /bin/bash
+
+THIS_DIR=$(dirname $(readlink -f $0))
+
+python3 -m torch.distributed.launch \
+ --nproc_per_node=1 \
+ $THIS_DIR/run_clm.py \
+ --model_name_or_path gpt2 \
+ --dataset_name wikitext \
+ --dataset_config_name wikitext-103-raw-v1 \
+ --per_device_train_batch_size 16 \
+ --per_device_eval_batch_size 8 \
+ --num_train_epochs 2 \
+ --do_train \
+ --do_eval \
+ --output_dir /tmp/quant/test-clm \
+ --overwrite_output_dir \
+ --resume_from_checkpoint /tmp/test-clm \
+ --fp16 \
+ --logging_steps 10 \
+ --block_size 512 \
+ --module_type 2 \
+ --enable_quant true
diff --git a/examples/training/huggingface/ls_hf_transformer_encoder_layer.py b/examples/training/huggingface/ls_hf_transformer_encoder_layer.py
deleted file mode 100644
index 38db61fe..00000000
--- a/examples/training/huggingface/ls_hf_transformer_encoder_layer.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import random
-
-from lightseq.training.ops.pytorch.transformer_encoder_layer import (
- LSTransformerEncoderLayer,
-)
-
-
-class LSHFTransformerEncoderLayer(LSTransformerEncoderLayer):
- def __init__(self, *args, **kwargs):
- super(LSHFTransformerEncoderLayer, self).__init__(*args, **kwargs)
-
- def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs):
- ls_encoder_padding_mask = encoder_padding_mask / -10000.0
- ls_encoder_padding_mask = ls_encoder_padding_mask.squeeze()
- output = super().forward(hidden_states, ls_encoder_padding_mask)
- return (output, None, None, None)
-
-
-def gen_bert_config(training_args, config):
- bert_config = LSTransformerEncoderLayer.get_config(
- max_batch_tokens=4096,
- max_seq_len=config.max_position_embeddings,
- hidden_size=config.hidden_size,
- intermediate_size=config.intermediate_size,
- nhead=config.num_attention_heads,
- attn_prob_dropout_ratio=config.attention_probs_dropout_prob,
- activation_dropout_ratio=config.hidden_dropout_prob,
- hidden_dropout_ratio=config.hidden_dropout_prob,
- pre_layer_norm=False,
- fp16=training_args.fp16,
- local_rank=training_args.local_rank,
- activation_fn="gelu",
- )
- return bert_config
-
-
-def get_hf_bert_enc_layer_params(layer):
- init_ws = []
- init_bs = []
-
- init_ws.append(layer.attention.self.query.weight.detach().clone())
- init_bs.append(layer.attention.self.query.bias.detach().clone())
- init_ws.append(layer.attention.self.key.weight.detach().clone())
- init_bs.append(layer.attention.self.key.bias.detach().clone())
- init_ws.append(layer.attention.self.value.weight.detach().clone())
- init_bs.append(layer.attention.self.value.bias.detach().clone())
- init_ws.append(layer.attention.output.dense.weight.detach().clone())
- init_bs.append(layer.attention.output.dense.bias.detach().clone())
- init_ws.append(layer.attention.output.LayerNorm.weight.detach().clone())
- init_bs.append(layer.attention.output.LayerNorm.bias.detach().clone())
-
- init_ws.append(layer.intermediate.dense.weight.detach().clone())
- init_bs.append(layer.intermediate.dense.bias.detach().clone())
- init_ws.append(layer.output.dense.weight.detach().clone())
- init_bs.append(layer.output.dense.bias.detach().clone())
- init_ws.append(layer.output.LayerNorm.weight.detach().clone())
- init_bs.append(layer.output.LayerNorm.bias.detach().clone())
-
- return init_ws, init_bs
-
-
-def inject_ls_enc_layer(model, training_args, config):
- for i in range(config.num_hidden_layers):
- bert_config = gen_bert_config(training_args, config)
- init_ws, init_bs = get_hf_bert_enc_layer_params(model.bert.encoder.layer[i])
- model.bert.encoder.layer[i] = LSHFTransformerEncoderLayer(
- bert_config, init_ws, init_bs
- ).cuda()
diff --git a/examples/training/huggingface/run_ner_no_trainer.py b/examples/training/huggingface/run_ner_no_trainer.py
deleted file mode 100644
index 88db653b..00000000
--- a/examples/training/huggingface/run_ner_no_trainer.py
+++ /dev/null
@@ -1,618 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning a 🤗 Transformers model on token classification tasks (NER, POS, CHUNKS) relying on the accelerate library
-without using a Trainer.
-"""
-
-import argparse
-import logging
-import math
-import os
-import random
-
-import datasets
-import torch
-from datasets import ClassLabel, load_dataset, load_metric
-from torch.utils.data.dataloader import DataLoader
-from tqdm.auto import tqdm
-
-import transformers
-from accelerate import Accelerator
-from transformers import (
- CONFIG_MAPPING,
- MODEL_MAPPING,
- AdamW,
- AutoConfig,
- AutoModelForTokenClassification,
- AutoTokenizer,
- DataCollatorForTokenClassification,
- SchedulerType,
- default_data_collator,
- get_scheduler,
- set_seed,
-)
-from ls_hf_transformer_encoder_layer import inject_ls_enc_layer
-
-logger = logging.getLogger(__name__)
-# You should update this to your particular problem to have better documentation of `model_type`
-MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-
-def parse_args():
- parser = argparse.ArgumentParser(
- description="Finetune a transformers model on a text classification task (NER) with accelerate library"
- )
- parser.add_argument(
- "--dataset_name",
- type=str,
- default=None,
- help="The name of the dataset to use (via the datasets library).",
- )
- parser.add_argument(
- "--dataset_config_name",
- type=str,
- default=None,
- help="The configuration name of the dataset to use (via the datasets library).",
- )
- parser.add_argument(
- "--train_file",
- type=str,
- default=None,
- help="A csv or a json file containing the training data.",
- )
- parser.add_argument(
- "--validation_file",
- type=str,
- default=None,
- help="A csv or a json file containing the validation data.",
- )
- parser.add_argument(
- "--max_length",
- type=int,
- default=128,
- help=(
- "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
- " sequences shorter will be padded if `--pad_to_max_lenght` is passed."
- ),
- )
- parser.add_argument(
- "--pad_to_max_length",
- action="store_true",
- help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
- )
- parser.add_argument(
- "--model_name_or_path",
- type=str,
- help="Path to pretrained model or model identifier from huggingface.co/models.",
- required=True,
- )
- parser.add_argument(
- "--config_name",
- type=str,
- default=None,
- help="Pretrained config name or path if not the same as model_name",
- )
- parser.add_argument(
- "--tokenizer_name",
- type=str,
- default=None,
- help="Pretrained tokenizer name or path if not the same as model_name",
- )
- parser.add_argument(
- "--per_device_train_batch_size",
- type=int,
- default=8,
- help="Batch size (per device) for the training dataloader.",
- )
- parser.add_argument(
- "--per_device_eval_batch_size",
- type=int,
- default=8,
- help="Batch size (per device) for the evaluation dataloader.",
- )
- parser.add_argument(
- "--learning_rate",
- type=float,
- default=5e-5,
- help="Initial learning rate (after the potential warmup period) to use.",
- )
- parser.add_argument(
- "--weight_decay", type=float, default=0.0, help="Weight decay to use."
- )
- parser.add_argument(
- "--num_train_epochs",
- type=int,
- default=3,
- help="Total number of training epochs to perform.",
- )
- parser.add_argument(
- "--max_train_steps",
- type=int,
- default=None,
- help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
- )
- parser.add_argument(
- "--gradient_accumulation_steps",
- type=int,
- default=1,
- help="Number of updates steps to accumulate before performing a backward/update pass.",
- )
- parser.add_argument(
- "--lr_scheduler_type",
- type=SchedulerType,
- default="linear",
- help="The scheduler type to use.",
- choices=[
- "linear",
- "cosine",
- "cosine_with_restarts",
- "polynomial",
- "constant",
- "constant_with_warmup",
- ],
- )
- parser.add_argument(
- "--num_warmup_steps",
- type=int,
- default=0,
- help="Number of steps for the warmup in the lr scheduler.",
- )
- parser.add_argument(
- "--output_dir", type=str, default=None, help="Where to store the final model."
- )
- parser.add_argument(
- "--seed", type=int, default=None, help="A seed for reproducible training."
- )
- parser.add_argument(
- "--model_type",
- type=str,
- default=None,
- help="Model type to use if training from scratch.",
- choices=MODEL_TYPES,
- )
- parser.add_argument(
- "--label_all_tokens",
- action="store_true",
- help="Setting labels of all special tokens to -100 and thus PyTorch will ignore them.",
- )
- parser.add_argument(
- "--return_entity_level_metrics",
- action="store_true",
- help="Indication whether entity level metrics are to be returner.",
- )
- parser.add_argument(
- "--task_name",
- type=str,
- default="ner",
- choices=["ner", "pos", "chunk"],
- help="The name of the task.",
- )
- parser.add_argument(
- "--debug",
- action="store_true",
- help="Activate debug mode and run training only with a subset of data.",
- )
- args = parser.parse_args()
-
- # Sanity checks
- if (
- args.task_name is None
- and args.train_file is None
- and args.validation_file is None
- ):
- raise ValueError("Need either a task name or a training/validation file.")
- else:
- if args.train_file is not None:
- extension = args.train_file.split(".")[-1]
- assert extension in [
- "csv",
- "json",
- ], "`train_file` should be a csv or a json file."
- if args.validation_file is not None:
- extension = args.validation_file.split(".")[-1]
- assert extension in [
- "csv",
- "json",
- ], "`validation_file` should be a csv or a json file."
-
- if args.output_dir is not None:
- os.makedirs(args.output_dir, exist_ok=True)
-
- return args
-
-
-def main():
- args = parse_args()
-
- # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
- accelerator = Accelerator()
- # Make one log on every process with the configuration for debugging.
- logging.basicConfig(
- format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
- datefmt="%m/%d/%Y %H:%M:%S",
- level=logging.INFO,
- )
- logger.info(accelerator.state)
-
- # Setup logging, we only want one process per machine to log things on the screen.
- # accelerator.is_local_main_process is only True for one process per machine.
- logger.setLevel(
- logging.INFO if accelerator.is_local_main_process else logging.ERROR
- )
- if accelerator.is_local_main_process:
- datasets.utils.logging.set_verbosity_warning()
- transformers.utils.logging.set_verbosity_info()
- else:
- datasets.utils.logging.set_verbosity_error()
- transformers.utils.logging.set_verbosity_error()
-
- # If passed along, set the training seed now.
- if args.seed is not None:
- set_seed(args.seed)
-
- # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
- # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/
- # (the dataset will be downloaded automatically from the datasets Hub).
- #
- # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
- # 'tokens' is found. You can easily tweak this behavior (see below).
- #
- # In distributed training, the load_dataset function guarantee that only one local process can concurrently
- # download the dataset.
- if args.dataset_name is not None:
- # Downloading and loading a dataset from the hub.
- raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
- else:
- data_files = {}
- if args.train_file is not None:
- data_files["train"] = args.train_file
- if args.validation_file is not None:
- data_files["validation"] = args.validation_file
- extension = args.train_file.split(".")[-1]
- raw_datasets = load_dataset(extension, data_files=data_files)
- # Trim a number of training examples
- if args.debug:
- for split in raw_datasets.keys():
- raw_datasets[split] = raw_datasets[split].select(range(100))
- # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
- # https://huggingface.co/docs/datasets/loading_datasets.html.
-
- if raw_datasets["train"] is not None:
- column_names = raw_datasets["train"].column_names
- features = raw_datasets["train"].features
- else:
- column_names = raw_datasets["validation"].column_names
- features = raw_datasets["validation"].features
- text_column_name = "tokens" if "tokens" in column_names else column_names[0]
- label_column_name = (
- f"{args.task_name}_tags"
- if f"{args.task_name}_tags" in column_names
- else column_names[1]
- )
-
- # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
- # unique labels.
- def get_label_list(labels):
- unique_labels = set()
- for label in labels:
- unique_labels = unique_labels | set(label)
- label_list = list(unique_labels)
- label_list.sort()
- return label_list
-
- if isinstance(features[label_column_name].feature, ClassLabel):
- label_list = features[label_column_name].feature.names
- # No need to convert the labels since they are already ints.
- label_to_id = {i: i for i in range(len(label_list))}
- else:
- label_list = get_label_list(raw_datasets["train"][label_column_name])
- label_to_id = {l: i for i, l in enumerate(label_list)}
- num_labels = len(label_list)
-
- # Load pretrained model and tokenizer
- #
- # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
- # download model & vocab.
- if args.config_name:
- config = AutoConfig.from_pretrained(args.config_name, num_labels=num_labels)
- elif args.model_name_or_path:
- config = AutoConfig.from_pretrained(
- args.model_name_or_path, num_labels=num_labels
- )
- else:
- config = CONFIG_MAPPING[args.model_type]()
- logger.warning("You are instantiating a new config instance from scratch.")
-
- if args.tokenizer_name:
- tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=True)
- elif args.model_name_or_path:
- tokenizer = AutoTokenizer.from_pretrained(
- args.model_name_or_path, use_fast=True
- )
- else:
- raise ValueError(
- "You are instantiating a new tokenizer from scratch. This is not supported by this script."
- "You can do it from another script, save it, and load it from here, using --tokenizer_name."
- )
-
- if args.model_name_or_path:
- model = AutoModelForTokenClassification.from_pretrained(
- args.model_name_or_path,
- from_tf=bool(".ckpt" in args.model_name_or_path),
- config=config,
- )
- else:
- logger.info("Training new model from scratch")
- model = AutoModelForTokenClassification.from_config(config)
-
- model.resize_token_embeddings(len(tokenizer))
-
- # Replace with LightSeq encoder layers.
- args.local_rank = accelerator.local_process_index
- args.fp16 = accelerator.use_fp16
- inject_ls_enc_layer(model, args, config)
-
- # Preprocessing the raw_datasets.
- # First we tokenize all the texts.
- padding = "max_length" if args.pad_to_max_length else False
-
- # Tokenize all texts and align the labels with them.
-
- def tokenize_and_align_labels(examples):
- tokenized_inputs = tokenizer(
- examples[text_column_name],
- max_length=args.max_length,
- padding=padding,
- truncation=True,
- # We use this argument because the texts in our dataset are lists of words (with a label for each word).
- is_split_into_words=True,
- )
-
- labels = []
- for i, label in enumerate(examples[label_column_name]):
- word_ids = tokenized_inputs.word_ids(batch_index=i)
- previous_word_idx = None
- label_ids = []
- for word_idx in word_ids:
- # Special tokens have a word id that is None. We set the label to -100 so they are automatically
- # ignored in the loss function.
- if word_idx is None:
- label_ids.append(-100)
- # We set the label for the first token of each word.
- elif word_idx != previous_word_idx:
- label_ids.append(label_to_id[label[word_idx]])
- # For the other tokens in a word, we set the label to either the current label or -100, depending on
- # the label_all_tokens flag.
- else:
- label_ids.append(
- label_to_id[label[word_idx]] if args.label_all_tokens else -100
- )
- previous_word_idx = word_idx
-
- labels.append(label_ids)
- tokenized_inputs["labels"] = labels
- return tokenized_inputs
-
- processed_raw_datasets = raw_datasets.map(
- tokenize_and_align_labels,
- batched=True,
- remove_columns=raw_datasets["train"].column_names,
- )
-
- train_dataset = processed_raw_datasets["train"]
- eval_dataset = processed_raw_datasets["validation"]
-
- # DataLoaders creation:
- if args.pad_to_max_length:
- # If padding was already done ot max length, we use the default data collator that will just convert everything
- # to tensors.
- data_collator = default_data_collator
- else:
- # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of
- # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
- # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
- data_collator = DataCollatorForTokenClassification(
- tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
- )
-
- train_dataloader = DataLoader(
- train_dataset,
- shuffle=True,
- collate_fn=data_collator,
- batch_size=args.per_device_train_batch_size,
- )
- eval_dataloader = DataLoader(
- eval_dataset,
- collate_fn=data_collator,
- batch_size=args.per_device_eval_batch_size,
- )
-
- # Optimizer
- # Split weights in two groups, one with weight decay and the other not.
- no_decay = ["bias", "LayerNorm.weight"]
- optimizer_grouped_parameters = [
- {
- "params": [
- p
- for n, p in model.named_parameters()
- if not any(nd in n for nd in no_decay)
- ],
- "weight_decay": args.weight_decay,
- },
- {
- "params": [
- p
- for n, p in model.named_parameters()
- if any(nd in n for nd in no_decay)
- ],
- "weight_decay": 0.0,
- },
- ]
- optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
-
- # Use the device given by the `accelerator` object.
- device = accelerator.device
- model.to(device)
-
- # Prepare everything with our `accelerator`.
- model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
- model, optimizer, train_dataloader, eval_dataloader
- )
-
- # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
- # shorter in multiprocess)
-
- # Scheduler and math around the number of training steps.
- num_update_steps_per_epoch = math.ceil(
- len(train_dataloader) / args.gradient_accumulation_steps
- )
- if args.max_train_steps is None:
- args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
- else:
- args.num_train_epochs = math.ceil(
- args.max_train_steps / num_update_steps_per_epoch
- )
-
- lr_scheduler = get_scheduler(
- name=args.lr_scheduler_type,
- optimizer=optimizer,
- num_warmup_steps=args.num_warmup_steps,
- num_training_steps=args.max_train_steps,
- )
-
- # Metrics
- metric = load_metric("seqeval")
-
- def get_labels(predictions, references):
- # Transform predictions and references tensos to numpy arrays
- if device.type == "cpu":
- y_pred = predictions.detach().clone().numpy()
- y_true = references.detach().clone().numpy()
- else:
- y_pred = predictions.detach().cpu().clone().numpy()
- y_true = references.detach().cpu().clone().numpy()
-
- # Remove ignored index (special tokens)
- true_predictions = [
- [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
- for pred, gold_label in zip(y_pred, y_true)
- ]
- true_labels = [
- [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
- for pred, gold_label in zip(y_pred, y_true)
- ]
- return true_predictions, true_labels
-
- def compute_metrics():
- results = metric.compute()
- if args.return_entity_level_metrics:
- # Unpack nested dictionaries
- final_results = {}
- for key, value in results.items():
- if isinstance(value, dict):
- for n, v in value.items():
- final_results[f"{key}_{n}"] = v
- else:
- final_results[key] = value
- return final_results
- else:
- return {
- "precision": results["overall_precision"],
- "recall": results["overall_recall"],
- "f1": results["overall_f1"],
- "accuracy": results["overall_accuracy"],
- }
-
- # Train!
- total_batch_size = (
- args.per_device_train_batch_size
- * accelerator.num_processes
- * args.gradient_accumulation_steps
- )
-
- logger.info("***** Running training *****")
- logger.info(f" Num examples = {len(train_dataset)}")
- logger.info(f" Num Epochs = {args.num_train_epochs}")
- logger.info(
- f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
- )
- logger.info(
- f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
- )
- logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
- logger.info(f" Total optimization steps = {args.max_train_steps}")
- # Only show the progress bar once on each machine.
- progress_bar = tqdm(
- range(args.max_train_steps), disable=not accelerator.is_local_main_process
- )
- completed_steps = 0
-
- for epoch in range(args.num_train_epochs):
- model.train()
- for step, batch in enumerate(train_dataloader):
- outputs = model(**batch)
- loss = outputs.loss
- loss = loss / args.gradient_accumulation_steps
- accelerator.backward(loss)
- if (
- step % args.gradient_accumulation_steps == 0
- or step == len(train_dataloader) - 1
- ):
- optimizer.step()
- lr_scheduler.step()
- optimizer.zero_grad()
- progress_bar.update(1)
- completed_steps += 1
-
- if completed_steps >= args.max_train_steps:
- break
-
- model.eval()
- for step, batch in enumerate(eval_dataloader):
- with torch.no_grad():
- outputs = model(**batch)
- predictions = outputs.logits.argmax(dim=-1)
- labels = batch["labels"]
- if (
- not args.pad_to_max_length
- ): # necessary to pad predictions and labels for being gathered
- predictions = accelerator.pad_across_processes(
- predictions, dim=1, pad_index=-100
- )
- labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
-
- predictions_gathered = accelerator.gather(predictions)
- labels_gathered = accelerator.gather(labels)
- preds, refs = get_labels(predictions_gathered, labels_gathered)
- metric.add_batch(
- predictions=preds,
- references=refs,
- ) # predictions and preferences are expected to be a nested list of labels, not label_ids
-
- eval_metric = metric.compute()
- # eval_metric = compute_metrics()
- accelerator.print(f"epoch {epoch}:", eval_metric)
-
- if args.output_dir is not None:
- accelerator.wait_for_everyone()
- unwrapped_model = accelerator.unwrap_model(model)
- unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
-
-
-if __name__ == "__main__":
- main()
diff --git a/examples/training/neurst/README.md b/examples/training/neurst/README.md
index bb3ded25..d755155e 100644
--- a/examples/training/neurst/README.md
+++ b/examples/training/neurst/README.md
@@ -3,27 +3,27 @@ This repo contains an example for how to use LightSeq to accerate the training o
First you should install these requirements.
```shell
-pip install subword-nmt pyyaml sacrebleu sacremoses
-git clone https://github.com/moses-smt/mosesdecoder.git
+$ pip install subword-nmt pyyaml sacrebleu sacremoses
+$ git clone https://github.com/moses-smt/mosesdecoder.git
```
Then clone NeurST and switch to lightseq branch.
```shell
-git clone https://github.com/bytedance/neurst.git
-cd neurst/
-git checkout lightseq
-pip install -e .
+$ git clone https://github.com/bytedance/neurst.git
+$ cd neurst/
+$ git checkout lightseq
+$ pip install -e .
```
Install lightseq
```shell
-pip install http://sf3-ttcdn-tos.pstatp.com/obj/nlp-opensource/lightseq/tensorflow/lightseq_tf-2.0.1-cp37-cp37m-linux_x86_64.whl
+$ pip install http://sf3-ttcdn-tos.pstatp.com/obj/nlp-opensource/lightseq/tensorflow/lightseq_tf-2.0.1-cp37-cp37m-linux_x86_64.whl
```
Download and preprocess data
```shell
-./examples/translation/prepare-wmt14en2de-bpe.sh ../mosesdecoder
+$ ./examples/translation/prepare-wmt14en2de-bpe.sh ../mosesdecoder
```
Train the model
```shell
-python3 -m neurst.cli.run_exp \
+$ python3 -m neurst.cli.run_exp \
--config_paths wmt14_en_de/training_args.yml,wmt14_en_de/translation_bpe.yml \
--hparams_set transformer_base \
--model_dir wmt14_en_de/benchmark_base \
diff --git a/examples/triton_backend/README.md b/examples/triton_backend/README.md
index c4eed7d1..2ab191da 100644
--- a/examples/triton_backend/README.md
+++ b/examples/triton_backend/README.md
@@ -21,11 +21,11 @@
- The meaning of parameters in config.pbtxt; more information can be found in [Model config of tritonbackend](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto)
- > ${name}: name of model,**which should be same with **
+ > ${name}: name of model, **which should be same with **
>
- > ${backend}: **fixed value - "lightseq"**,which is used to recognize the dynamic link library of tritonbackend, libtriton_lightseq.so
+ > ${backend}: **fixed value - "lightseq"**, which is used to recognize the dynamic link library of tritonbackend, libtriton_lightseq.so
>
- > ${default_model_filename}: name of model file,**which should be same with **
+ > ${default_model_filename}: name of model file, **which should be same with **
>
> ${parameters - value - string_value}: the type of model, which should be supported by lightseq. You can choose `Transformer`|`QuantTransformer`|`Bert`|`Gpt`|`Moe`
diff --git a/lightseq/inference/README.md b/lightseq/inference/README.md
index 24b2bfad..0819db21 100644
--- a/lightseq/inference/README.md
+++ b/lightseq/inference/README.md
@@ -65,15 +65,15 @@ More results is available [here](../../docs/inference/performance.md).
We provide an end2end bart-base example to see how fast LightSeq is compared to HuggingFace. First you should install these requirements.
```shell
-pip install torch tensorflow transformers lightseq
-cd examples/inference/python
+$ pip install torch tensorflow transformers lightseq
+$ cd examples/inference/python
```
then you can check the performance by simply running following commands. `hf_bart_export.py` is used to transform pytorch weights to LightSeq protobuffer.
```shell
-python export/hf_bart_export.py
-python test/ls_bart.py
+$ python export/huggingface/hf_bart_export.py
+$ python test/ls_bart.py
```
On our Tesla V100 we get the following output; a 10x speedup is obtained by running LightSeq rather than HuggingFace.
@@ -97,7 +97,7 @@ Nothing's gonna change my love for you.
Drop everything now. Meet me in the pouring rain. Kiss me on the sidewalk.
```
-LightSeq installation from pypi only supports python 3.6 to 3.8 on Linux for now. Consider compiling from source if you have other environments.
+LightSeq installation from PyPI only supports Python 3.6 to 3.8 on Linux for now. Consider compiling from source if you have other environments.
And there is also a quick start for huggingface GPT in examples.
@@ -108,8 +108,8 @@ We provide python api to call lightseq, all you need is to install `lightseq` wi
And check these files `lightseq/inference/proto/*.proto` to prepare your model weights. We provide an example weight file for you to test.
```shell
-curl -OL https://github.com/bytedance/lightseq/releases/download/v0.0.1/transformer_weight.tar.gz
-tar -zxvf transformer_weight.tar.gz
+$ curl -OL https://github.com/bytedance/lightseq/releases/download/v0.0.1/transformer_weight.tar.gz
+$ tar -zxvf transformer_weight.tar.gz
```
Finally you can run lightseq in only a few lines!
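The "few lines" themselves fall outside this hunk; as a rough sketch only (the class name `lsi.Transformer`, the weight filename `transformer.pb`, and the sample token ids below are assumptions, not taken from this diff), a call through the Python API looks roughly like:

```python
import lightseq.inference as lsi

# Load the protobuf weights prepared above; the second argument caps the batch size.
model = lsi.Transformer("transformer.pb", 8)
# Run inference on a batch of source token id sequences.
result = model.infer([[4, 17, 23, 65, 8, 2]])
print(result)
```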
@@ -138,12 +138,12 @@ To avoid problems caused by inconsistent environments, you can use the pre-built
[nvidia-docker](https://github.com/NVIDIA/nvidia-docker) and make your GPU driver version >= 410.48
```shell
-docker pull nvcr.io/nvidia/tensorrtserver:19.05-py3
+$ docker pull nvcr.io/nvidia/tensorrtserver:19.05-py3
#
-docker run --gpus '"device=0"' -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v
+$ docker run --gpus '"device=0"' -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v
/${current}/${path}:/quick_start nvcr.io/nvidia/tensorrtserver:19.05-py3 /bin/bash
# inside container
-cd /quick_start
+$ cd /quick_start
```
### Use our pre-built lib
@@ -154,8 +154,8 @@ version, we will upload binary executable example and dynamic link library of mo
custom backend of TRTIS.
```shell
-wget https://github.com/bytedance/lightseq/releases/download/${VERSION}/${VERSION}_libs.tar.gz
-tar -zxvf ${VERSION}_libs.tar.gz
+$ wget https://github.com/bytedance/lightseq/releases/download/${VERSION}/${VERSION}_libs.tar.gz
+$ tar -zxvf ${VERSION}_libs.tar.gz
```
### Run local inference demo
@@ -164,12 +164,12 @@ To run local inference demo, you need to prepare model weights saved in custom p
LightSeq and input token ids. We provide a GPT-LM model and its corresponding input token ids:
```shell
-wget https://github.com/bytedance/lightseq/releases/download/v0.0.1/v0.0.1_gptlm.pkg.tar.gz
-tar -zxvf v0.0.1_gptlm.pkg.tar.gz
+$ wget https://github.com/bytedance/lightseq/releases/download/v0.0.1/v0.0.1_gptlm.pkg.tar.gz
+$ tar -zxvf v0.0.1_gptlm.pkg.tar.gz
# fp32 example
-./{VERSION}_libs/gptlm_example.fp32 ./v0.0.1_gptlm.pkg/gpt.pb ./v0.0.1_gptlm.pkg/test_case
+$ ./{VERSION}_libs/gptlm_example.fp32 ./v0.0.1_gptlm.pkg/gpt.pb ./v0.0.1_gptlm.pkg/test_case
# fp16 example
-./{VERSION}_libs/gptlm_example.fp16 ./v0.0.1_gptlm.pkg/gpt.pb ./v0.0.1_gptlm.pkg/test_case
+$ ./{VERSION}_libs/gptlm_example.fp16 ./v0.0.1_gptlm.pkg/gpt.pb ./v0.0.1_gptlm.pkg/test_case
```
To run the end-to-end model server based on TRTIS, you need to prepare a custom backend [model
@@ -187,15 +187,15 @@ models/
With the pre-built libraries and example weights mentioned above, you can easily run a server:
```shell
-mkdir -p ./model_zoo/gptlm/1
-wget https://github.com/bytedance/lightseq/releases/download/v0.0.1/v0.0.1_gptlm.config.pbtxt
-mv v0.0.1_gptlm.config.pbtxt model_zoo/gptlm/config.pbtxt
-cp ./v0.0.1_gptlm.pkg/gpt.pb model_zoo/gptlm/gpt.pb
-cp ./{VERSION}_libs/libgptlm.so.fp32 model_zoo/gptlm/1/libgptlm.so
+$ mkdir -p ./model_zoo/gptlm/1
+$ wget https://github.com/bytedance/lightseq/releases/download/v0.0.1/v0.0.1_gptlm.config.pbtxt
+$ mv v0.0.1_gptlm.config.pbtxt model_zoo/gptlm/config.pbtxt
+$ cp ./v0.0.1_gptlm.pkg/gpt.pb model_zoo/gptlm/gpt.pb
+$ cp ./{VERSION}_libs/libgptlm.so.fp32 model_zoo/gptlm/1/libgptlm.so
# or fp16 server
# cp ./{VERSION}_libs/libgptlm.so.fp16 model_zoo/gptlm/1/libgptlm.so
-export MODEL_ZOO="/quick_start/model_zoo"
-trtserver --model-store=${MODEL_ZOO}
+$ export MODEL_ZOO="/quick_start/model_zoo"
+$ trtserver --model-store=${MODEL_ZOO}
```
After starting the server, invoking the [TRTIS
diff --git a/lightseq/inference/kernels/CMakeLists.txt b/lightseq/inference/kernels/CMakeLists.txt
index 5f647bcd..b9cebb32 100644
--- a/lightseq/inference/kernels/CMakeLists.txt
+++ b/lightseq/inference/kernels/CMakeLists.txt
@@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.18)
set(cuda_kernel_files
gptKernels.cc.cu
+ gptKernels_int8.cc.cu
transformerKernels.cc.cu
multilgKernels.cc.cu
embKernels.cc.cu
diff --git a/lightseq/inference/kernels/embKernels_int8.cc.cu b/lightseq/inference/kernels/embKernels_int8.cc.cu
index bade6241..28251303 100644
--- a/lightseq/inference/kernels/embKernels_int8.cc.cu
+++ b/lightseq/inference/kernels/embKernels_int8.cc.cu
@@ -14,7 +14,8 @@ template
__global__ void ker_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
const int *tokens, T *output, int *pad_mask,
int pad_id, int batch_size, int seq_len,
- int hidden_dim, float dequant_scale) {
+ int hidden_dim, float dequant_scale,
+ bool scaled) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= batch_size * seq_len * hidden_dim) {
return;
@@ -39,7 +40,8 @@ __global__ void ker_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
}
char4 value_i4 = ((char4 *)token_emb)[token * hidden_dim + dim_idx];
float4 pemb = ((float4 *)pos_emb)[seq_idx * hidden_dim + dim_idx];
- float scale = dequant_scale * sqrtf(hidden_dim << 2);
+ float scale = dequant_scale;
+ if (scaled) scale *= sqrtf(hidden_dim << 2);
value.x = float(value_i4.x) * scale + pemb.x;
value.y = float(value_i4.y) * scale + pemb.y;
value.z = float(value_i4.z) * scale + pemb.z;
@@ -49,12 +51,10 @@ __global__ void ker_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
}
template <>
-__global__ void ker_enc_emb_i8I<__half>(const int8_t *token_emb,
- const __half *pos_emb,
- const int *tokens, __half *output,
- int *pad_mask, int pad_id,
- int batch_size, int seq_len,
- int hidden_dim, float dequant_scale) {
+__global__ void ker_enc_emb_i8I<__half>(
+ const int8_t *token_emb, const __half *pos_emb, const int *tokens,
+ __half *output, int *pad_mask, int pad_id, int batch_size, int seq_len,
+ int hidden_dim, float dequant_scale, bool scaled) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= batch_size * seq_len * hidden_dim) {
return;
@@ -82,7 +82,8 @@ __global__ void ker_enc_emb_i8I<__half>(const int8_t *token_emb,
__half2 *value_h2 = (__half2 *)(&value);
char2 *value_i2 = (char2 *)(&value_i8);
__half2 *pemb_h2 = (__half2 *)(&pemb);
- float scale = dequant_scale * sqrtf(hidden_dim << 3);
+ float scale = dequant_scale;
+ if (scaled) scale *= sqrtf(hidden_dim << 3);
#pragma unroll
for (int i = 0; i < 4; i++) {
float2 value_f2;
@@ -101,7 +102,7 @@ void launch_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
int batch_size, int seq_len, int hidden_dim,
cudaStream_t stream, const T *lang_emb,
const int *lang_id, int multilg_type,
- float dequant_scale) {
+ float dequant_scale, bool scaled) {
if (hidden_dim % 4 != 0) {
throw std::runtime_error("violate hidden_dim % 4 = 0");
}
@@ -111,7 +112,7 @@ void launch_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
if (multilg_type == 0) {
ker_enc_emb_i8I<<>>(
token_emb, pos_emb, tokens, output, pad_mask, pad_id, batch_size,
- seq_len, hidden_dim, dequant_scale);
+ seq_len, hidden_dim, dequant_scale, scaled);
} else {
throw std::runtime_error("multilingle not supported");
}
@@ -124,7 +125,7 @@ void launch_enc_emb_i8I<__half>(const int8_t *token_emb, const __half *pos_emb,
int seq_len, int hidden_dim,
cudaStream_t stream, const __half *lang_emb,
const int *lang_id, int multilg_type,
- float dequant_scale) {
+ float dequant_scale, bool scaled) {
if (hidden_dim % 8 != 0) {
throw std::runtime_error("violate hidden_dim % 8 = 0");
}
@@ -135,7 +136,7 @@ void launch_enc_emb_i8I<__half>(const int8_t *token_emb, const __half *pos_emb,
if (multilg_type == 0) {
ker_enc_emb_i8I<__half><<>>(
token_emb, pos_emb, tokens, output, pad_mask, pad_id, batch_size,
- seq_len, hidden_dim, dequant_scale);
+ seq_len, hidden_dim, dequant_scale, scaled);
} else {
throw std::runtime_error("multilingle not supported");
}
@@ -145,13 +146,13 @@ template void launch_enc_emb_i8I(
const int8_t *token_emb, const float *pos_emb, const int *tokens,
float *output, int *pad_mask, int pad_id, int batch_size, int seq_len,
int hidden_dim, cudaStream_t stream, const float *lang_emb,
- const int *lang_id, int multilg_type, float dequant_scale);
+ const int *lang_id, int multilg_type, float dequant_scale, bool scaled);
template void launch_enc_emb_i8I<__half>(
const int8_t *token_emb, const __half *pos_emb, const int *tokens,
__half *output, int *pad_mask, int pad_id, int batch_size, int seq_len,
int hidden_dim, cudaStream_t stream, const __half *lang_emb,
- const int *lang_id, int multilg_type, float dequant_scale);
+ const int *lang_id, int multilg_type, float dequant_scale, bool scaled);
template <typename T>
__global__ void ker_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb,
@@ -159,7 +160,7 @@ __global__ void ker_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb,
const int *lang_id, T *output, int batch_size,
int beam_size, int hidden_dim, int vocab_size,
int step, int max_step, int multilg_type,
- float dequant_scale) {
+ float dequant_scale, bool scaled) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= batch_size * beam_size * hidden_dim) {
return;
@@ -170,8 +171,10 @@ __global__ void ker_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb,
int8_t emb;
int token = tokens[flat_3dim(batch_idx, beam_idx, step, beam_size, max_step)];
emb = token_emb[flat_2dim(dim_idx, token, vocab_size)];
- float value = float(emb) * dequant_scale * sqrtf(hidden_dim) +
- float(pos_emb[flat_2dim(step, dim_idx, hidden_dim)]);
+ float scale = dequant_scale;
+ if (scaled) scale *= sqrtf(hidden_dim);
+ float value =
+ float(emb) * scale + float(pos_emb[flat_2dim(step, dim_idx, hidden_dim)]);
output[idx] = T(value);
}
@@ -181,7 +184,7 @@ void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens,
int batch_size, int beam_size, int hidden_dim,
int vocab_size, int step, int max_step,
int multilg_type, cudaStream_t stream,
- float dequant_scale) {
+ float dequant_scale, bool scaled) {
if (step >= max_step) {
throw std::runtime_error("violate step < max_step");
}
@@ -193,19 +196,19 @@ void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens,
ker_dec_emb_i8I<<>>(
token_emb, pos_emb, tokens, lang_emb, lang_id, output, batch_size,
beam_size, hidden_dim, vocab_size, step, max_step, multilg_type,
- dequant_scale);
+ dequant_scale, scaled);
}
template void launch_dec_emb_i8I<float>(
const int8_t *token_emb, const float *pos_emb, int *tokens,
const float *lang_emb, const int *lang_id, float *output, int batch_size,
int beam_size, int hidden_dim, int vocab_size, int step, int max_step,
- int multilg_type, cudaStream_t stream, float dequant_scale);
+ int multilg_type, cudaStream_t stream, float dequant_scale, bool scaled);
template void launch_dec_emb_i8I<__half>(
const int8_t *token_emb, const __half *pos_emb, int *tokens,
const __half *lang_emb, const int *lang_id, __half *output, int batch_size,
int beam_size, int hidden_dim, int vocab_size, int step, int max_step,
- int multilg_type, cudaStream_t stream, float dequant_scale);
+ int multilg_type, cudaStream_t stream, float dequant_scale, bool scaled);
} // namespace cuda
} // namespace lightseq
diff --git a/lightseq/inference/kernels/embKernels_int8.h b/lightseq/inference/kernels/embKernels_int8.h
index 6ec8fde1..a914f9f1 100644
--- a/lightseq/inference/kernels/embKernels_int8.h
+++ b/lightseq/inference/kernels/embKernels_int8.h
@@ -11,7 +11,7 @@ void launch_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb,
int batch_size, int seq_len, int hidden_dim,
cudaStream_t stream, const T *lang_emb,
const int *lang_id, int multilg_type,
- float dequant_scale);
+ float dequant_scale, bool scaled = true);
template <typename T>
void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens,
@@ -19,7 +19,7 @@ void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens,
int batch_size, int beam_size, int hidden_dim,
int vocab_size, int step, int max_step,
int multilg_type, cudaStream_t stream,
- float dequant_scale);
+ float dequant_scale, bool scaled = true);
} // namespace cuda
} // namespace lightseq
diff --git a/lightseq/inference/kernels/gptKernels_int8.cc.cu b/lightseq/inference/kernels/gptKernels_int8.cc.cu
new file mode 100644
index 00000000..286193f2
--- /dev/null
+++ b/lightseq/inference/kernels/gptKernels_int8.cc.cu
@@ -0,0 +1,866 @@
+#include
+
+#include "common.h"
+#include "gptKernels_int8.h"
+#include "transformerKernels.h"
+/**
+@file
+Implements the CUDA kernel functions and their launchers
+required by the GPT model.
+Currently, fp16 and fp32 versions are provided.
+*/
+namespace lightseq {
+namespace cuda {
+__forceinline__ __device__ int8_t float2int8(float x, float quant_scale) {
+ float i8_f = x * quant_scale;
+ int32_t i8 = floorf(i8_f + 0.5);
+ i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8);
+ return int8_t(i8);
+}
+
+template <typename T>
+__global__ void ker_gpt_embedding_int8(const int8_t* token_emb,
+ const T* pos_emb, const int* token_id,
+ T* output, int* real_seq_len,
+ int padding_id, int pos_offset,
+ float dequant_scale) {
+ int target_pos = blockIdx.x * gridDim.y + blockIdx.y;
+ int tid = token_id[target_pos];
+ if (tid == padding_id) {
+ // for padding id
+ output[target_pos * blockDim.x + threadIdx.x] = 0.f;
+ return;
+ }
+ if (threadIdx.x == 0) {
+ atomicAdd(real_seq_len + blockIdx.x, 1);
+ }
+ output[target_pos * blockDim.x + threadIdx.x] =
+ T(token_emb[tid * blockDim.x + threadIdx.x]) * dequant_scale +
+ pos_emb[(blockIdx.y + pos_offset) * blockDim.x + threadIdx.x];
+}
+
+/* fp16 version */
+template <>
+__global__ void ker_gpt_embedding_int8<__half>(
+ const int8_t* token_emb, const __half* pos_emb, const int* token_id,
+ __half* output, int* real_seq_len, int padding_id, int pos_offset,
+ float dequant_scale) {
+ int target_pos = blockIdx.x * gridDim.y + blockIdx.y;
+ int tid = token_id[target_pos];
+ half2* output_h = (half2*)output;
+
+ if (tid == padding_id) {
+ // for padding id
+ output_h[target_pos * blockDim.x + threadIdx.x] = __float2half2_rn(0.f);
+ return;
+ }
+ if (threadIdx.x == 0) {
+ atomicAdd(real_seq_len + blockIdx.x, 1);
+ }
+
+ float2 te;
+ char2 cte = ((const char2*)token_emb)[tid * blockDim.x + threadIdx.x];
+ float2 pe = __half22float2(
+ ((const half2*)
+ pos_emb)[(blockIdx.y + pos_offset) * blockDim.x + threadIdx.x]);
+ te.x = float(cte.x) * dequant_scale + pe.x;
+ te.y = float(cte.y) * dequant_scale + pe.y;
+ output_h[target_pos * blockDim.x + threadIdx.x] = __float22half2_rn(te);
+}
+
+template <typename T>
+void ker_gpt_embedding_i8I_launcher(int batch_size, int batch_seq_len,
+ int hidden_size, cudaStream_t stream,
+ const int8_t* token_emb, const T* pos_emb,
+ const int* token_id, T* output,
+ int* real_seq_len, int padding_id,
+ int pos_offset, float dequant_scale) {
+  ker_gpt_embedding_int8<T>
+      <<<dim3(batch_size, batch_seq_len), hidden_size, 0, stream>>>(
+ token_emb, pos_emb, token_id, output, real_seq_len, padding_id,
+ pos_offset, dequant_scale);
+}
+
+template <>
+void ker_gpt_embedding_i8I_launcher<__half>(
+ int batch_size, int batch_seq_len, int hidden_size, cudaStream_t stream,
+ const int8_t* token_emb, const __half* pos_emb, const int* token_id,
+ __half* output, int* real_seq_len, int padding_id, int pos_offset,
+ float dequant_scale) {
+ ker_gpt_embedding_int8<__half>
+      <<<dim3(batch_size, batch_seq_len), hidden_size / 2, 0, stream>>>(
+ token_emb, pos_emb, token_id, output, real_seq_len, padding_id,
+ pos_offset, dequant_scale);
+}
+
+template void ker_gpt_embedding_i8I_launcher<float>(
+ int batch_size, int batch_seq_len, int hidden_size, cudaStream_t stream,
+ const int8_t* token_emb, const float* pos_emb, const int* token_id,
+ float* output, int* real_seq_len, int padding_id, int pos_offset,
+ float dequant_scale);
+
+template void ker_gpt_embedding_i8I_launcher<__half>(
+ int batch_size, int batch_seq_len, int hidden_size, cudaStream_t stream,
+ const int8_t* token_emb, const __half* pos_emb, const int* token_id,
+ __half* output, int* real_seq_len, int padding_id, int pos_offset,
+ float dequant_scale);
+
+__global__ void ker_ppl_i8I(const int8_t* logits, const int* input_ids,
+ const int* real_seq_len, float* ppl, int vocab_size,
+ float dequant_scale, bool in_col32) {
+ int seq_len = real_seq_len[blockIdx.x]; // remove "eos"
+ if (blockIdx.y >= seq_len - 1) {
+ // will not contribute to ppl
+ return;
+ }
+
+ int token_idx_in_batch = blockIdx.x * gridDim.y + blockIdx.y;
+ int left_logit_idx = token_idx_in_batch * vocab_size + threadIdx.x;
+ int right_logit_idx = (token_idx_in_batch + 1) * vocab_size;
+ /*
+ step 1. find max logit over the whole vocab
+ */
+ float max_logit = CUDA_FLOAT_INF_NEG;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = token_idx_in_batch;
+ int col_id = idx - token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id, gridDim.x * gridDim.y,
+ vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ max_logit = fmaxf(max_logit, (float)logits[logits_idx] * dequant_scale);
+ }
+ max_logit = blockReduceMax(max_logit);
+ __shared__ float s_max_logit;
+ if (threadIdx.x == 0) {
+ s_max_logit = max_logit;
+ }
+ __syncthreads();
+
+ /*
+ step 2. compute the log probability for the given token,
+ add it to the sequence's ppl
+ */
+ float sum_exp_logit = 0.f;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = token_idx_in_batch;
+ int col_id = idx - token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id, gridDim.x * gridDim.y,
+ vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float lgt = fmaxf((float)logits[logits_idx] * dequant_scale - s_max_logit,
+ logit_thresh_min);
+ sum_exp_logit += expf(lgt);
+ }
+ sum_exp_logit = blockReduceSum(sum_exp_logit);
+
+ if (threadIdx.x == 0) {
+ int token_id = input_ids[token_idx_in_batch + 1];
+ int logits_idx;
+ if (in_col32) {
+ int row_id = token_idx_in_batch;
+ int col_id = token_id;
+ logits_idx = row_major2flat_col32(row_id, col_id, gridDim.x * gridDim.y,
+ vocab_size);
+ } else {
+ logits_idx = token_idx_in_batch * vocab_size + token_id;
+ }
+ float log_prob = ((float)logits[logits_idx] * dequant_scale - s_max_logit -
+ logf(sum_exp_logit)) /
+ (float)(seq_len - 1);
+ atomicAdd(ppl + blockIdx.x, -log_prob);
+ }
+}
+
+void ker_ppl_i8I_launcher(int batch_size, int batch_seq_len,
+ int max_thread_per_block, cudaStream_t stream,
+ const int8_t* logits, const int* input_ids,
+ const int* real_seq_len, float* ppl, int vocab_size,
+ float dequant_scale, bool in_col32) {
+  ker_ppl_i8I<<<dim3(batch_size, batch_seq_len), max_thread_per_block, 0,
+                stream>>>(logits, input_ids, real_seq_len, ppl, vocab_size,
+                          dequant_scale, in_col32);
+}
+
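+/**
+@brief: ker_correlation_softmax_gpt_i32I
+causal-masked softmax over int32 attention scores; scores are dequantized
+with dequant_scale^2 (query and key are both int8) and scaled by attn_scale
+*/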
+template <typename T>
+__global__ void ker_correlation_softmax_gpt_i32I(
+ int32_t* correlation, T* output, const int* real_seq_len,
+ const int batch_seq_len, float attn_scale, float dequant_scale) {
+ int query_token_pos = blockIdx.y % batch_seq_len;
+ if (query_token_pos >= real_seq_len[blockIdx.x]) {
+ return;
+ }
+
+ int mask = 0; // can see the token when mask=0
+ if (threadIdx.x > query_token_pos || threadIdx.x >= batch_seq_len) {
+ mask = 1; // Can only see the token on the left side of it
+ }
+
+ int idx = (blockIdx.x * gridDim.y + blockIdx.y) * batch_seq_len + threadIdx.x;
+ float val = threadIdx.x < batch_seq_len
+ ? ((float)correlation[idx] * attn_scale * dequant_scale *
+ dequant_scale)
+ : CUDA_FLOAT_INF_NEG;
+ float max_val = blockReduceMax(mask ? CUDA_FLOAT_INF_NEG : val);
+ __shared__ float smax;
+ if (threadIdx.x == 0) smax = max_val;
+ __syncthreads();
+
+ val = mask ? 0.f : expf(val - smax);
+ float rsum = blockReduceSum(val);
+ __shared__ float ssum;
+ if (threadIdx.x == 0) ssum = rsum;
+ __syncthreads();
+
+ if (threadIdx.x < batch_seq_len) output[idx] = (T)(val / ssum);
+}
+
+template <typename T>
+void ker_correlation_softmax_gpt_i32I_launcher(
+ int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
+ int32_t* correlation, T* output, const int* real_seq_len, float attn_scale,
+ float dequant_scale) {
+ int block_dim = batch_seq_len;
+ if (batch_seq_len < 1024) {
+ block_dim = (batch_seq_len + 31) >> 5;
+ block_dim *= 32;
+ }
+
+  ker_correlation_softmax_gpt_i32I<T>
+      <<<dim3(batch_size, head_num * batch_seq_len), block_dim, 0, stream>>>(
+ correlation, output, real_seq_len, batch_seq_len, attn_scale,
+ dequant_scale);
+}
+
+template void ker_correlation_softmax_gpt_i32I_launcher<float>(
+ int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
+ int32_t* correlation, float* output, const int* real_seq_len,
+ float attn_scale, float dequant_scale);
+
+template void ker_correlation_softmax_gpt_i32I_launcher<__half>(
+ int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
+ int32_t* correlation, __half* output, const int* real_seq_len,
+ float attn_scale, float dequant_scale);
+
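+/**
+@brief: ker_topk_sample_i8I
+top-k sampling over the int8 logits of the last step, one block per batch
+sequence; the sampled id is appended to new_input_ids and also written back
+to old_input_ids as the input of the next decoding step
+*/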
+template <int k>
+__global__ void ker_topk_sample_i8I(const int8_t* logits, int* old_input_ids,
+ int* new_input_ids, const int* real_seq_len,
+ const int vocab_size,
+ const int batch_seq_len, int logits_seq_len,
+ int* unfinished, curandState* curandstate,
+ int eos_id, float dequant_scale,
+ bool in_col32) {
+ int last_token_idx_in_batch = blockIdx.x * batch_seq_len + batch_seq_len - 1;
+
+ /* add EOS to end if last token is EOS */
+ if (old_input_ids[last_token_idx_in_batch] == eos_id) {
+ int left_token_idx = blockIdx.x * batch_seq_len + threadIdx.x;
+ int right_token_idx = (blockIdx.x + 1) * batch_seq_len;
+ for (int idx = left_token_idx; idx < right_token_idx; idx += blockDim.x) {
+ int new_idx = idx + blockIdx.x;
+ new_input_ids[new_idx] = old_input_ids[idx];
+ }
+ if (threadIdx.x == 0) {
+ // blockIdx.x * (batch_seq_len+1) + batch_seq_len
+ new_input_ids[(blockIdx.x + 1) * (batch_seq_len + 1) - 1] = eos_id;
+ old_input_ids[gridDim.x * batch_seq_len + blockIdx.x] = eos_id;
+ }
+ return;
+ }
+ int logits_token_idx_in_batch =
+ blockIdx.x * logits_seq_len + logits_seq_len - 1;
+ int left_logit_idx = logits_token_idx_in_batch * vocab_size + threadIdx.x;
+ int right_logit_idx = (logits_token_idx_in_batch + 1) * vocab_size;
+
+ /*
+ step1. find max logit and rough Kth logit over the whole vocab
+ */
+ __shared__ float s_max_logit, s_topk_logit;
+ float rough_top_kth_logit = CUDA_FLOAT_INF_NEG;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ rough_top_kth_logit =
+ fmaxf(rough_top_kth_logit, (float)logits[logits_idx] * dequant_scale);
+ }
+ float max_logit = blockReduceMax(rough_top_kth_logit);
+  rough_top_kth_logit = blockRoughTopK<float, k>(rough_top_kth_logit);
+ if (threadIdx.x == 0) {
+ s_topk_logit = rough_top_kth_logit;
+ s_max_logit = max_logit;
+ }
+ __syncthreads();
+
+ __shared__ int s_tid;
+
+ if (k != 1) {
+ /* step2 hold one logit per thread which larger than Kth logit and sample
+ * from them */
+ float topk_exp_sum, topk_exp = CUDA_FLOAT_INF_NEG;
+ int topk_tid = vocab_size;
+ int test_num = 0;
+ __shared__ float s_topk_exp_sum;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(
+ row_id, col_id, gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit = (float)logits[logits_idx] * dequant_scale;
+ float logit_exp = expf(fmaxf(logit - s_max_logit, logit_thresh_min));
+ if (logit >= s_topk_logit) test_num++;
+ if (logit >= s_topk_logit && logit_exp > topk_exp) {
+ topk_exp = logit_exp;
+ topk_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+
+ test_num = blockReduceSum(test_num);
+
+ if (topk_tid == vocab_size) topk_exp = 0;
+ topk_exp_sum = blockReduceSum(topk_exp);
+ if (threadIdx.x == 0) {
+ s_topk_exp_sum = topk_exp_sum;
+ }
+ __syncthreads();
+
+ /* calculate cumulative probability */
+ float topk_prob = topk_exp / s_topk_exp_sum;
+ float prefix_sum_prob;
+    typedef cub::BlockScan<float, 1024> BlockScan;
+ __shared__ typename BlockScan::TempStorage temp_storage;
+ BlockScan(temp_storage).InclusiveSum(topk_prob, prefix_sum_prob);
+
+ __shared__ float random_x;
+ if (threadIdx.x == 0) {
+ random_x = curand_uniform(curandstate + blockIdx.x);
+ }
+ __syncthreads();
+
+ if (threadIdx.x == 0) {
+ s_tid = vocab_size;
+ }
+ __syncthreads();
+
+ int threadID = threadIdx.x;
+ __shared__ int s_threadID;
+ __shared__ float s_max_prob;
+ if (random_x > prefix_sum_prob) threadID = blockDim.x;
+ threadID = blockReduceMin(threadID);
+ float max_prob = blockReduceMax(topk_prob);
+ if (threadIdx.x == 0) {
+ s_threadID = threadID;
+ s_max_prob = max_prob;
+ }
+ __syncthreads();
+ if (threadIdx.x == s_threadID) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+ if (s_tid == vocab_size && topk_prob == s_max_prob) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+ } else {
+ s_tid = vocab_size;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(
+ row_id, col_id, gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit = (float)logits[logits_idx] * dequant_scale;
+ if (logit == s_max_logit) {
+ s_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+ __syncthreads();
+ }
+
+  /* if the newly sampled tid is not EOS, set unfinished to TRUE */
+ if (threadIdx.x == 0) {
+ if (s_tid != eos_id) unfinished[0] = 1;
+ }
+
+ /* step3 copy old_input_ids to new_input_ids and add new sampled ids */
+ int left_token_idx = blockIdx.x * batch_seq_len + threadIdx.x;
+ int right_token_idx = (blockIdx.x + 1) * batch_seq_len;
+ for (int idx = left_token_idx; idx < right_token_idx; idx += blockDim.x) {
+ int new_idx = idx + blockIdx.x;
+ new_input_ids[new_idx] = old_input_ids[idx];
+ }
+ if (threadIdx.x == 0) {
+ new_input_ids[(blockIdx.x + 1) * (batch_seq_len + 1) - 1] = s_tid;
+ // save the newly sampled ids to old_input_ids for next step inputs
+ old_input_ids[gridDim.x * batch_seq_len + blockIdx.x] = s_tid;
+ }
+}
+
+void ker_topk_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ int logits_seq_len, int max_thread_per_block,
+ cudaStream_t stream, const int8_t* logits,
+ int* old_input_ids, int* new_input_ids,
+ const int* real_seq_len, const int vocab_size,
+ const int k, int* unfinished,
+ curandState* curandstate, int eos_id,
+ float dequant_scale, bool in_col32) {
+ if (k == 1)
+    ker_topk_sample_i8I<1><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 2)
+    ker_topk_sample_i8I<2><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 4)
+    ker_topk_sample_i8I<4><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 8)
+    ker_topk_sample_i8I<8><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 16)
+    ker_topk_sample_i8I<16><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 32)
+    ker_topk_sample_i8I<32><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else {
+ throw std::invalid_argument("topk argument should be in [1,2,4,8,16,32]");
+ }
+}
+
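+/**
+@brief: ker_topp_sample_i8I
+approximate nucleus (top-p) sampling over the int8 logits of the last step:
+per-thread max logits are sorted descending, a cumulative-probability cutoff
+derived from p selects a logit threshold, and one id is sampled from the
+logits above that threshold
+*/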
+__global__ void ker_topp_sample_i8I(const int8_t* logits, int* old_input_ids,
+ int* new_input_ids, const int* real_seq_len,
+ const int vocab_size,
+ const int batch_seq_len, int logits_seq_len,
+ int* unfinished, float p,
+ curandState* curandstate, int eos_id,
+ float dequant_scale, bool in_col32) {
+ int token_idx_in_batch = blockIdx.x * batch_seq_len + batch_seq_len - 1;
+
+ /* add EOS to end if last token is EOS */
+ if (old_input_ids[token_idx_in_batch] == eos_id) {
+ int left_token_idx = blockIdx.x * batch_seq_len + threadIdx.x;
+ int right_token_idx = (blockIdx.x + 1) * batch_seq_len;
+ for (int idx = left_token_idx; idx < right_token_idx; idx += blockDim.x) {
+ int new_idx = idx + blockIdx.x;
+ new_input_ids[new_idx] = old_input_ids[idx];
+ }
+ if (threadIdx.x == 0) {
+ new_input_ids[(blockIdx.x + 1) * (batch_seq_len + 1) - 1] = eos_id;
+ old_input_ids[gridDim.x * batch_seq_len + blockIdx.x] = eos_id;
+ }
+ return;
+ }
+ int logits_token_idx_in_batch =
+ blockIdx.x * logits_seq_len + logits_seq_len - 1;
+ int left_logit_idx = logits_token_idx_in_batch * vocab_size + threadIdx.x;
+ int right_logit_idx = (logits_token_idx_in_batch + 1) * vocab_size;
+
+ /*
+ step1. find max logit in each thread and sample from these probs with nucleus
+ sampling
+ */
+ __shared__ float s_max_logit;
+ float max_logit = CUDA_FLOAT_INF_NEG;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ max_logit = fmaxf(max_logit, (float)logits[logits_idx] * dequant_scale);
+ }
+ float max_logit_array[1];
+ max_logit_array[0] = max_logit;
+  typedef cub::BlockRadixSort<float, 1024, 1> BlockRadixSort;
+ __shared__ typename BlockRadixSort::TempStorage sort_temp_storage;
+ BlockRadixSort(sort_temp_storage).SortDescending(max_logit_array);
+ float presum_max_logit_exp;
+ max_logit = max_logit_array[0];
+
+ float block_max_logit = blockReduceMax(max_logit);
+ if (threadIdx.x == 0) {
+ s_max_logit = block_max_logit;
+ }
+ __syncthreads();
+
+ float biased_logit_exp =
+ expf(fmaxf(max_logit - s_max_logit, logit_thresh_min));
+
+  typedef cub::BlockScan<float, 1024> BlockScan;
+ __shared__ typename BlockScan::TempStorage presum_temp_storage;
+ BlockScan(presum_temp_storage)
+ .InclusiveSum(biased_logit_exp, presum_max_logit_exp);
+
+ float topp_exp_threshold;
+ if (threadIdx.x == blockDim.x - 1) {
+ topp_exp_threshold = p * presum_max_logit_exp;
+ }
+ __shared__ float s_presum_logit_exp_threshold;
+ if (presum_max_logit_exp > topp_exp_threshold) {
+ presum_max_logit_exp = CUDA_FLOAT_INF_NEG;
+ }
+ float logit_exp_threshold = blockReduceMax(presum_max_logit_exp);
+ if (threadIdx.x == 0) {
+ s_presum_logit_exp_threshold = logit_exp_threshold;
+ }
+ __syncthreads();
+
+ __shared__ float s_logit_threshold;
+ if (presum_max_logit_exp == s_presum_logit_exp_threshold) {
+ s_logit_threshold = max_logit;
+ }
+ __syncthreads();
+
+ /* step2 hold one logit per thread and sample
+ * from them */
+ float topk_exp_sum, topk_exp = CUDA_FLOAT_INF_NEG;
+ int topk_tid = vocab_size;
+ int test_num = 0;
+ __shared__ float s_topk_exp_sum;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit = (float)logits[logits_idx] * dequant_scale;
+ float logit_exp = expf(fmaxf(logit - s_max_logit, logit_thresh_min));
+ if (logit >= s_logit_threshold) test_num++;
+ if (logit >= s_logit_threshold && logit_exp > topk_exp) {
+ topk_exp = logit_exp;
+ topk_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+
+ test_num = blockReduceSum(test_num);
+
+ if (topk_tid == vocab_size) topk_exp = 0;
+ topk_exp_sum = blockReduceSum(topk_exp);
+ if (threadIdx.x == 0) {
+ s_topk_exp_sum = topk_exp_sum;
+ }
+ __syncthreads();
+
+ /* calculate cumulative probability */
+ float topk_prob = topk_exp / s_topk_exp_sum;
+ float prefix_sum_prob;
+ BlockScan(presum_temp_storage).InclusiveSum(topk_prob, prefix_sum_prob);
+
+ __shared__ float random_x;
+ if (threadIdx.x == 0) {
+ random_x = curand_uniform(curandstate + blockIdx.x);
+ }
+ __syncthreads();
+
+ __shared__ int s_tid;
+ if (threadIdx.x == 0) {
+ s_tid = vocab_size;
+ }
+ __syncthreads();
+
+ int threadID = threadIdx.x;
+ __shared__ int s_threadID;
+ __shared__ float s_max_prob;
+ if (random_x > prefix_sum_prob) threadID = blockDim.x;
+ threadID = blockReduceMin(threadID);
+ float max_prob = blockReduceMax(topk_prob);
+ if (threadIdx.x == 0) {
+ s_threadID = threadID;
+ s_max_prob = max_prob;
+ }
+ __syncthreads();
+ if (threadIdx.x == s_threadID) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+ if (s_tid == vocab_size && topk_prob == s_max_prob) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+  /* if the newly sampled tid is not EOS, set unfinished to TRUE */
+ if (threadIdx.x == 0) {
+ if (s_tid != eos_id) unfinished[0] = 1;
+ }
+
+ /* step3 copy old_input_ids to new_input_ids and add new sampled ids */
+ int left_token_idx = blockIdx.x * batch_seq_len + threadIdx.x;
+ int right_token_idx = (blockIdx.x + 1) * batch_seq_len;
+ for (int idx = left_token_idx; idx < right_token_idx; idx += blockDim.x) {
+ int new_idx = idx + blockIdx.x;
+ new_input_ids[new_idx] = old_input_ids[idx];
+ }
+ if (threadIdx.x == 0) {
+ new_input_ids[(blockIdx.x + 1) * (batch_seq_len + 1) - 1] = s_tid;
+ // save the newly sampled ids to old_input_ids for next step inputs
+ old_input_ids[gridDim.x * batch_seq_len + blockIdx.x] = s_tid;
+ }
+}
+
+void ker_topp_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ int logits_seq_len, int max_thread_per_block,
+ cudaStream_t stream, const int8_t* logits,
+ int* old_input_ids, int* new_input_ids,
+ const int* real_seq_len, const int vocab_size,
+ const float p, int* unfinished,
+ curandState* curandstate, int eos_id,
+ float dequant_scale, bool in_col32) {
+  ker_topp_sample_i8I<<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, old_input_ids, new_input_ids, real_seq_len, vocab_size,
+ batch_seq_len, logits_seq_len, unfinished, p, curandstate, eos_id,
+ dequant_scale, in_col32);
+}
+
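+/**
+@brief: ker_arrange_qkv_with_cache_i8I_i8O
+split the int8 QKV projection of the newest token into query/key/value with
+[batch, head, seq, dim] layout, adding the bias and requantizing; earlier
+positions of key/value are copied from the int8 KV cache
+*/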
+template <typename T>
+__global__ void ker_arrange_qkv_with_cache_i8I_i8O(
+ const int8_t* ori_qkv, const T* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32) {
+ int hidden_size = head_num * dim_per_head;
+ int batch_size = gridDim.x / batch_seq_len;
+ int batch_id = blockIdx.x / batch_seq_len;
+ int token_id = blockIdx.x % batch_seq_len;
+ int head_id = threadIdx.x / dim_per_head;
+ int dim_id = threadIdx.x % dim_per_head;
+ int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len, dim_per_head);
+ int8_t new_val;
+
+ if (token_id < batch_seq_len - 1) {
+ int old_target_id =
+ targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len - 1, dim_per_head);
+ if (blockIdx.y == 0) return;
+ if (blockIdx.y == 1) new_val = k_cache[old_target_id];
+ if (blockIdx.y == 2) new_val = v_cache[old_target_id];
+ } else {
+ int qkv_index;
+ if (in_col32) {
+ int row_id = batch_id;
+ int col_id = blockIdx.y * hidden_size + threadIdx.x;
+ qkv_index = row_major2flat_col32(row_id, col_id, batch_size,
+ gridDim.y * hidden_size);
+ } else {
+ qkv_index =
+ (batch_id * gridDim.y + blockIdx.y) * hidden_size + threadIdx.x;
+ }
+ float tmp_val = float(ori_qkv[qkv_index]) * dequant_scale +
+ __ldg(&qkv_bias[blockIdx.y * hidden_size + threadIdx.x]);
+ new_val = float2int8(tmp_val, quant_scale);
+ if (blockIdx.y == 0) {
+ target_id = targetid_4dim(batch_id, head_id, 0, dim_id, head_num, 1,
+ dim_per_head);
+ }
+ }
+
+ if (blockIdx.y == 0) new_q[target_id] = new_val;
+ if (blockIdx.y == 1) new_k[target_id] = new_val;
+ if (blockIdx.y == 2) {
+ new_v[target_id] = new_val;
+ }
+}
+
+template <>
+__global__ void ker_arrange_qkv_with_cache_i8I_i8O<__half>(
+ const int8_t* ori_qkv, const __half* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32) {
+ int hidden_size = head_num * dim_per_head;
+ int batch_size = gridDim.x / batch_seq_len;
+ int batch_id = blockIdx.x / batch_seq_len;
+ int token_id = blockIdx.x % batch_seq_len;
+ int head_id = threadIdx.x / dim_per_head;
+ int dim_id = threadIdx.x % dim_per_head;
+ int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len, dim_per_head);
+ int2 new_val;
+ int8_t* p_new_val = (int8_t*)(&new_val);
+ const int2* p_ori_qkv = (const int2*)ori_qkv;
+ const float4* p_bias = (const float4*)qkv_bias;
+ const int2* p_k_cache = (const int2*)k_cache;
+ const int2* p_v_cache = (const int2*)v_cache;
+ int2* p_new_q = (int2*)new_q;
+ int2* p_new_k = (int2*)new_k;
+ int2* p_new_v = (int2*)new_v;
+
+ if (token_id < batch_seq_len - 1) {
+ int old_target_id =
+ targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len - 1, dim_per_head);
+ if (blockIdx.y == 0) return;
+ if (blockIdx.y == 1) new_val = p_k_cache[old_target_id];
+ if (blockIdx.y == 2) new_val = p_v_cache[old_target_id];
+ } else {
+ int qkv_index;
+ if (in_col32) {
+ int row_id = batch_id;
+ int col_id = (blockIdx.y * hidden_size + threadIdx.x) << 3;
+ qkv_index = row_major2flat_col32(row_id, col_id, batch_size,
+ (gridDim.y * hidden_size) << 3) >>
+ 3;
+ } else {
+ qkv_index =
+ (batch_id * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+ }
+ int2 ori_qkv8 = p_ori_qkv[qkv_index];
+ float4 bias8 = __ldg(&p_bias[blockIdx.y * blockDim.x + threadIdx.x]);
+ int8_t* p_ori_qkv8 = (int8_t*)(&ori_qkv8);
+ __half* p_bias8 = (__half*)(&bias8);
+#pragma unroll
+ for (int i = 0; i < 8; ++i) {
+ p_new_val[i] =
+ float2int8(float(p_ori_qkv8[i]) * dequant_scale + float(p_bias8[i]),
+ quant_scale);
+ }
+ if (blockIdx.y == 0) {
+ target_id = targetid_4dim(batch_id, head_id, 0, dim_id, head_num, 1,
+ dim_per_head);
+ }
+ }
+
+ if (blockIdx.y == 0) p_new_q[target_id] = new_val;
+ if (blockIdx.y == 1) p_new_k[target_id] = new_val;
+ if (blockIdx.y == 2) p_new_v[target_id] = new_val;
+}
+
+template <typename T>
+void ker_arrange_qkv_with_cache_i8I_i8O_launcher(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t* ori_qkv, const T* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32) {
+  ker_arrange_qkv_with_cache_i8I_i8O<T>
+      <<<dim3(batch_token_num, 3), hidden_size, 0, stream>>>(
+ ori_qkv, qkv_bias, new_q, new_k, k_cache, new_v, v_cache,
+ batch_seq_len, dim_per_head, head_num, dequant_scale, quant_scale,
+ in_col32);
+}
+
+template <>
+void ker_arrange_qkv_with_cache_i8I_i8O_launcher<__half>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t* ori_qkv, const __half* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32) {
+ ker_arrange_qkv_with_cache_i8I_i8O<__half>
+      <<<dim3(batch_token_num, 3), hidden_size / 8, 0, stream>>>(
+ ori_qkv, qkv_bias, new_q, new_k, k_cache, new_v, v_cache,
+ batch_seq_len, dim_per_head / 8, head_num, dequant_scale, quant_scale,
+ in_col32);
+}
+
+template void ker_arrange_qkv_with_cache_i8I_i8O_launcher<float>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t* ori_qkv, const float* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32);
+
+template void ker_arrange_qkv_with_cache_i8I_i8O_launcher<__half>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t* ori_qkv, const __half* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32);
+
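+/**
+@brief: ker_attention_mask_weights_i32I
+causal-masked softmax for self attention during incremental decoding: int32
+correlation scores are dequantized with dequant_scale^2, scaled by attn_scale
+and normalized over the source positions each query is allowed to attend to
+*/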
+template <typename T>
+__global__ void ker_attention_mask_weights_i32I(
+ int32_t* correlation, T* output, const int* real_seq_len, int dst_seq_len,
+ int src_seq_len, float attn_scale, float dequant_scale) {
+ int query_token_pos = blockIdx.y % dst_seq_len + src_seq_len - dst_seq_len;
+ if (query_token_pos >= real_seq_len[blockIdx.x]) {
+ return;
+ }
+ int mask = 0; // can see the token when mask=0
+ if (threadIdx.x > query_token_pos) {
+ mask = 1; // Can only see the token on the left side of it
+ }
+
+ int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+ float val =
+ (float)correlation[idx] * attn_scale * dequant_scale * dequant_scale;
+ float max_val = blockReduceMax(mask ? CUDA_FLOAT_INF_NEG : val);
+ __shared__ float smax;
+ if (threadIdx.x == 0) smax = max_val;
+ __syncthreads();
+
+ val = mask ? 0.f : expf(fmaxf(logit_thresh_min, val - smax));
+ float rsum = blockReduceSum(val);
+ __shared__ float ssum;
+ if (threadIdx.x == 0) ssum = rsum;
+ __syncthreads();
+
+ output[idx] = (T)(val / (ssum + epsilon));
+}
+
+template <typename T>
+void ker_attention_mask_weights_i32I_launcher(
+ int batch_size, int dst_seq_len, int src_seq_len, int head_num,
+ cudaStream_t stream, int32_t* correlation, T* output,
+ const int* real_seq_len, float attn_scale, float dequant_scale) {
+  ker_attention_mask_weights_i32I<T>
+      <<<dim3(batch_size, head_num * dst_seq_len), src_seq_len, 0, stream>>>(
+ correlation, output, real_seq_len, dst_seq_len, src_seq_len,
+ attn_scale, dequant_scale);
+}
+
+template void ker_attention_mask_weights_i32I_launcher<float>(
+ int batch_size, int dst_seq_len, int src_seq_len, int head_num,
+ cudaStream_t stream, int32_t* correlation, float* output,
+ const int* real_seq_len, float attn_scale, float dequant_scale);
+
+template void ker_attention_mask_weights_i32I_launcher<__half>(
+ int batch_size, int dst_seq_len, int src_seq_len, int head_num,
+ cudaStream_t stream, int32_t* correlation, __half* output,
+ const int* real_seq_len, float attn_scale, float dequant_scale);
+
+} // namespace cuda
+} // namespace lightseq
diff --git a/lightseq/inference/kernels/gptKernels_int8.h b/lightseq/inference/kernels/gptKernels_int8.h
new file mode 100644
index 00000000..007e8e9a
--- /dev/null
+++ b/lightseq/inference/kernels/gptKernels_int8.h
@@ -0,0 +1,63 @@
+#pragma once
+#include <cstdint>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <curand_kernel.h>
+
+namespace lightseq {
+namespace cuda {
+
+template <typename T>
+void ker_gpt_embedding_i8I_launcher(int batch_size, int batch_seq_len,
+ int hidden_size, cudaStream_t stream,
+ const int8_t* token_emb, const T* pos_emb,
+ const int* token_id, T* output,
+ int* real_seq_len, int padding_id,
+ int pos_offset, float dequant_scale);
+
+void ker_ppl_i8I_launcher(int batch_size, int batch_seq_len,
+ int max_thread_per_block, cudaStream_t stream,
+ const int8_t* logits, const int* input_ids,
+ const int* real_seq_len, float* ppl, int vocab_size,
+ float dequant_scale, bool in_col32 = false);
+
+template <typename T>
+void ker_correlation_softmax_gpt_i32I_launcher(
+ int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
+ int32_t* correlation, T* output, const int* real_seq_len, float attn_scale,
+ float dequant_scale);
+
+void ker_topk_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ int logits_seq_len, int max_thread_per_block,
+ cudaStream_t stream, const int8_t* logits,
+ int* old_input_ids, int* new_input_ids,
+ const int* real_seq_len, const int vocab_size,
+ const int k, int* all_finished,
+ curandState* curandstate, int eos_id,
+ float dequant_scale, bool in_col32 = false);
+
+void ker_topp_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ int logits_seq_len, int max_thread_per_block,
+ cudaStream_t stream, const int8_t* logits,
+ int* old_input_ids, int* new_input_ids,
+ const int* real_seq_len, const int vocab_size,
+ const float p, int* unfinished,
+ curandState* curandstate, int eos_id,
+ float dequant_scale, bool in_col32 = false);
+
+template <typename T>
+void ker_arrange_qkv_with_cache_i8I_i8O_launcher(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t* ori_qkv, const T* qkv_bias, int8_t* new_q, int8_t* new_k,
+ int8_t* k_cache, int8_t* new_v, int8_t* v_cache, int batch_seq_len,
+ int dim_per_head, int head_num, float dequant_scale, float quant_scale,
+ bool in_col32 = false);
+
+template <typename T>
+void ker_attention_mask_weights_i32I_launcher(
+ int batch_size, int dst_seq_len, int src_seq_len, int head_num,
+ cudaStream_t stream, int32_t* correlation, T* output,
+ const int* real_seq_len, float attn_scale, float dequant_scale);
+
+} // namespace cuda
+} // namespace lightseq
diff --git a/lightseq/inference/kernels/transformerKernels.cc.cu b/lightseq/inference/kernels/transformerKernels.cc.cu
index c8794312..05a22094 100644
--- a/lightseq/inference/kernels/transformerKernels.cc.cu
+++ b/lightseq/inference/kernels/transformerKernels.cc.cu
@@ -810,7 +810,7 @@ __global__ void ker_arrange_decself_qkv(const T* ori_qkv, const T* qkv_bias,
T val = ori_qkv[(blockIdx.x * gridDim.y + blockIdx.y) * hidden_size + i] +
__ldg(&qkv_bias[blockIdx.y * hidden_size + i]);
int seq_id =
- blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
+ blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
if (blockIdx.y == 0) {
// for query
new_q[seq_id * hidden_size + i] = val;
@@ -841,7 +841,7 @@ __global__ void ker_arrange_decself_qkv<__half>(
half2 val = __hadd2(
p_qkv[(blockIdx.x * gridDim.y + blockIdx.y) * half_hidden_size + i],
__ldg(&p_bias[blockIdx.y * half_hidden_size + i]));
- // obvious,seq_id = batch_id * beam_size + beam_id
+ // obvious, seq_id = batch_id * beam_size + beam_id
int seq_id = blockIdx.x;
if (blockIdx.y == 0) {
// for query
diff --git a/lightseq/inference/kernels/transformerKernels_int8.cc.cu b/lightseq/inference/kernels/transformerKernels_int8.cc.cu
index 67406048..85c5d736 100644
--- a/lightseq/inference/kernels/transformerKernels_int8.cc.cu
+++ b/lightseq/inference/kernels/transformerKernels_int8.cc.cu
@@ -864,85 +864,72 @@ template void ker_residual_bias_ln_i32I_launcher(
 template <typename T>
__global__ void ker_bias_gelu_i8I_i8O(int8_t *input, int8_t *output,
- const T *bias, int total_count,
- int feature_dim, float dequant_scale,
- float quant_scale, bool in_out_col32) {
- int i = blockIdx.x * blockDim.x + threadIdx.x;
-
- if (i * 4 >= total_count) return;
+ const T *bias, int feature_dim,
+ float dequant_scale, float quant_scale,
+ bool in_col32, bool out_col32) {
+ int block_start = blockIdx.x * feature_dim;
+ int start = block_start + threadIdx.x;
+ int end = block_start + feature_dim;
+ for (int i = start; i < end; i += blockDim.x) {
+ int input_index;
+ if (in_col32) {
+ int row_id = blockIdx.x;
+ int col_id = i - block_start;
+ input_index =
+ row_major2flat_col32(row_id, col_id, gridDim.x, feature_dim);
+ } else {
+ input_index = i;
+ }
-  char4 *out4 = reinterpret_cast<char4 *>(output);
-  const char4 *data4 = reinterpret_cast<const char4 *>(input);
-  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
+ float fout = gelu(float(input[input_index]) * dequant_scale +
+ __ldg(&bias[i - block_start]));
- int bias_i;
- if (in_out_col32) {
- int row_size = total_count / feature_dim;
- int flat_i = i << 2;
- int col_id = (flat_i / (row_size * 32)) * 32 + (flat_i & 31);
- bias_i = col_id >> 2;
- } else {
- bias_i = i % (feature_dim >> 2);
+ int output_index;
+ if (out_col32) {
+ int row_id = blockIdx.x;
+ int col_id = i - block_start;
+ output_index =
+ row_major2flat_col32(row_id, col_id, gridDim.x, feature_dim);
+ } else {
+ output_index = i;
+ }
+ output[output_index] = float2int8(fout, quant_scale);
}
-
- const char4 input4 = data4[i];
- const float4 b4 = __ldg(&bias4[bias_i]);
- float4 output4;
-
- output4.x = gelu(float(input4.x) * dequant_scale + b4.x);
- output4.y = gelu(float(input4.y) * dequant_scale + b4.y);
- output4.z = gelu(float(input4.z) * dequant_scale + b4.z);
- output4.w = gelu(float(input4.w) * dequant_scale + b4.w);
-
- char4 out_i4;
- out_i4.x = float2int8(output4.x, quant_scale);
- out_i4.y = float2int8(output4.y, quant_scale);
- out_i4.z = float2int8(output4.z, quant_scale);
- out_i4.w = float2int8(output4.w, quant_scale);
- out4[i] = out_i4;
}
/* fp16 version */
template <>
-__global__ void ker_bias_gelu_i8I_i8O<__half>(int8_t *input, int8_t *output,
- const __half *bias,
- int total_count, int feature_dim,
- float dequant_scale,
- float quant_scale,
- bool in_out_col32) {
- int i = blockIdx.x * blockDim.x + threadIdx.x;
-
- if (i * 8 >= total_count) return;
-
-  const int2 *vals_int2 = reinterpret_cast<const int2 *>(input);
-  int64_t *outs_i8 = reinterpret_cast<int64_t *>(output);
-  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
-
- int bias_i;
- if (in_out_col32) {
- int row_size = total_count / feature_dim;
- int flat_i = i << 3;
- int col_id = (flat_i / (row_size * 32)) * 32 + (flat_i & 31);
- bias_i = col_id >> 3;
- } else {
- bias_i = i % (feature_dim >> 3);
- }
+__global__ void ker_bias_gelu_i8I_i8O<__half>(
+ int8_t *input, int8_t *output, const __half *bias, int feature_dim,
+ float dequant_scale, float quant_scale, bool in_col32, bool out_col32) {
+ int block_start = blockIdx.x * feature_dim;
+ int start = block_start + threadIdx.x;
+ int end = block_start + feature_dim;
+ for (int i = start; i < end; i += blockDim.x) {
+ int input_index;
+ if (in_col32) {
+ int row_id = blockIdx.x;
+ int col_id = i - block_start;
+ input_index =
+ row_major2flat_col32(row_id, col_id, gridDim.x, feature_dim);
+ } else {
+ input_index = i;
+ }
- int2 val_int2 = vals_int2[i];
-  int8_t *val1 = reinterpret_cast<int8_t *>(&val_int2);
-  const float4 b4 = __ldg(&bias4[bias_i]);
-  const __half *b_half = reinterpret_cast<const __half *>(&b4);
-  int64_t out_i8;
-  int8_t *out_i1 = reinterpret_cast<int8_t *>(&out_i8);
+ float fout = gelu(float(input[input_index]) * dequant_scale +
+ __half2float(__ldg(&bias[i - block_start])));
-#pragma unroll
- for (int j = 0; j < 8; ++j) {
- float out_f;
- out_f =
- gelu(float(val1[j]) * dequant_scale + __half2float(b_half[j]));
- out_i1[j] = float2int8(out_f, quant_scale);
+ int output_index;
+ if (out_col32) {
+ int row_id = blockIdx.x;
+ int col_id = i - block_start;
+ output_index =
+ row_major2flat_col32(row_id, col_id, gridDim.x, feature_dim);
+ } else {
+ output_index = i;
+ }
+ output[output_index] = float2int8(fout, quant_scale);
}
- outs_i8[i] = out_i8;
}
 template <typename T>
@@ -950,35 +937,31 @@ void ker_bias_gelu_i8I_i8O_launcher(int batch_token_num, cudaStream_t stream,
int8_t *input, int8_t *output,
const T *bias, int feature_dim,
float dequant_scale, float quant_scale,
- bool in_out_col32) {
- int total_count = batch_token_num * feature_dim;
- int grid_dim = total_count >> 10;
-  ker_bias_gelu_i8I_i8O<T><<<grid_dim + 1, 256, 0, stream>>>(
- input, output, bias, total_count, feature_dim, dequant_scale, quant_scale,
- in_out_col32);
+ bool in_col32, bool out_col32) {
+  ker_bias_gelu_i8I_i8O<T><<<batch_token_num, 1024, 0, stream>>>(
+ input, output, bias, feature_dim, dequant_scale, quant_scale, in_col32,
+ out_col32);
}
template <>
void ker_bias_gelu_i8I_i8O_launcher<__half>(
int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output,
const __half *bias, int feature_dim, float dequant_scale, float quant_scale,
- bool in_out_col32) {
- int total_count = batch_token_num * feature_dim;
- int grid_dim = total_count >> 11;
-  ker_bias_gelu_i8I_i8O<__half><<<grid_dim + 1, 256, 0, stream>>>(
- input, output, bias, total_count, feature_dim, dequant_scale, quant_scale,
- in_out_col32);
+ bool in_col32, bool out_col32) {
+  ker_bias_gelu_i8I_i8O<__half><<<batch_token_num, 1024, 0, stream>>>(
+ input, output, bias, feature_dim, dequant_scale, quant_scale, in_col32,
+ out_col32);
}
 template void ker_bias_gelu_i8I_i8O_launcher<float>(
int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output,
const float *bias, int feature_dim, float dequant_scale, float quant_scale,
- bool in_out_col32);
+ bool in_col32, bool out_col32);
template void ker_bias_gelu_i8I_i8O_launcher<__half>(
int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output,
const __half *bias, int feature_dim, float dequant_scale, float quant_scale,
- bool in_out_col32);
+ bool in_col32, bool out_col32);
 template <typename T>
__global__ void ker_bias_relu_i8I_i8O(int8_t *input, int8_t *output,
@@ -1199,6 +1182,122 @@ template void ker_arrange_encself_qkv_i8I_launcher<__half>(
int max_batch_dim, int batch_seq_len, int dim_per_head, int head_num,
int max_thread_per_block, float dequant_scale, bool in_col32);
+template <typename T>
+__global__ void ker_arrange_encself_qkv_i8I_i8O(
+ const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, T *d_v, int batch_seq_len, int dim_per_head, int head_num,
+ float dequant_scale, float quant_scale, bool in_col32) {
+ int hidden_size = dim_per_head * head_num;
+ int batch_id = blockIdx.x / batch_seq_len;
+ int token_id = blockIdx.x % batch_seq_len;
+ for (std::size_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+ int head_id = i / dim_per_head;
+ int dim_id = i % dim_per_head;
+ int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len, dim_per_head);
+ int qkv_index;
+ if (in_col32) {
+ int row_id = blockIdx.x;
+ int col_id = blockIdx.y * hidden_size + i;
+ qkv_index = row_major2flat_col32(row_id, col_id, gridDim.x,
+ gridDim.y * hidden_size);
+ } else {
+ qkv_index = (blockIdx.x * gridDim.y + blockIdx.y) * hidden_size + i;
+ }
+
+ float val = float(ori_qkv[qkv_index]) * dequant_scale +
+ __ldg(&qkv_bias[blockIdx.y * hidden_size + i]);
+ int8_t quant_val = float2int8(val, quant_scale);
+
+ if (blockIdx.y == 0) {
+ new_q[target_id] = quant_val;
+ } else if (blockIdx.y == 1) {
+ new_k[target_id] = quant_val;
+ } else {
+ new_v[target_id] = quant_val;
+ d_v[target_id] = float(quant_val) / quant_scale;
+ }
+ }
+}
+
+template <>
+__global__ void ker_arrange_encself_qkv_i8I_i8O<__half>(
+ const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, __half *d_v, int batch_seq_len, int dim_per_head,
+ int head_num, float dequant_scale, float quant_scale, bool in_col32) {
+ int hidden_size = dim_per_head * head_num;
+ int batch_id = blockIdx.x / batch_seq_len;
+ int token_id = blockIdx.x % batch_seq_len;
+ for (std::size_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+ int head_id = i / dim_per_head;
+ int dim_id = i % dim_per_head;
+ int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num,
+ batch_seq_len, dim_per_head);
+ int qkv_index;
+ if (in_col32) {
+ int row_id = blockIdx.x;
+ int col_id = blockIdx.y * hidden_size + i;
+ qkv_index = row_major2flat_col32(row_id, col_id, gridDim.x,
+ gridDim.y * hidden_size);
+ } else {
+ qkv_index = (blockIdx.x * gridDim.y + blockIdx.y) * hidden_size + i;
+ }
+
+ float val = float(ori_qkv[qkv_index]) * dequant_scale +
+ __half2float(__ldg(&qkv_bias[blockIdx.y * hidden_size + i]));
+ int8_t quant_val = float2int8(val, quant_scale);
+
+ if (blockIdx.y == 0) {
+ new_q[target_id] = quant_val;
+ } else if (blockIdx.y == 1) {
+ new_k[target_id] = quant_val;
+ } else {
+ new_v[target_id] = quant_val;
+ d_v[target_id] = __float2half(float(quant_val) / quant_scale);
+ }
+ }
+}
+
+template <typename T>
+void ker_arrange_encself_qkv_i8I_i8O_launcher(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, T *d_v, int batch_seq_len, int dim_per_head, int head_num,
+ int max_thread_per_block, float dequant_scale, float quant_scale,
+ bool in_col32) {
+  ker_arrange_encself_qkv_i8I_i8O<T>
+      <<<dim3(batch_token_num, 3), max_thread_per_block, 0, stream>>>(
+ ori_qkv, qkv_bias, new_q, new_k, new_v, d_v, batch_seq_len,
+ dim_per_head, head_num, dequant_scale, quant_scale, in_col32);
+}
+
+template <>
+void ker_arrange_encself_qkv_i8I_i8O_launcher<__half>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, __half *d_v, int batch_seq_len, int dim_per_head,
+ int head_num, int max_thread_per_block, float dequant_scale,
+ float quant_scale, bool in_col32) {
+ ker_arrange_encself_qkv_i8I_i8O<__half>
+      <<<dim3(batch_token_num, 3), max_thread_per_block, 0, stream>>>(
+ ori_qkv, qkv_bias, new_q, new_k, new_v, d_v, batch_seq_len,
+ dim_per_head, head_num, dequant_scale, quant_scale, in_col32);
+}
+
+template void ker_arrange_encself_qkv_i8I_i8O_launcher<float>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t *ori_qkv, const float *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, float *d_v, int batch_seq_len, int dim_per_head,
+ int head_num, int max_thread_per_block, float dequant_scale,
+ float quant_scale, bool in_col32);
+
+template void ker_arrange_encself_qkv_i8I_i8O_launcher<__half>(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, __half *d_v, int batch_seq_len, int dim_per_head,
+ int head_num, int max_thread_per_block, float dequant_scale,
+ float quant_scale, bool in_col32);
+
 template <typename T>
__global__ void ker_arrange_atten_output_i8O(const T *ori_q, int8_t *new_q,
int beam_size, int dim_per_head,
@@ -1294,7 +1393,7 @@ template void ker_arrange_atten_output_i8O_launcher<__half>(
int head_num, int max_thread_per_block, float quant_scale, bool out_col32);
 template <typename T>
-__global__ void ker_arrange_decself_qkv_i8I(
+__global__ void ker_arrange_decself_qkv_i8I_i8O(
const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
float dequant_scale, float quant_scale, bool in_col32) {
@@ -1313,7 +1412,7 @@ __global__ void ker_arrange_decself_qkv_i8I(
__ldg(&qkv_bias[blockIdx.y * hidden_size + i]);
int8_t quant_val = float2int8(val, quant_scale);
int seq_id =
- blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
+ blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
if (blockIdx.y == 0) {
// for query
new_q[seq_id * hidden_size + i] = quant_val;
@@ -1334,7 +1433,7 @@ __global__ void ker_arrange_decself_qkv_i8I(
}
template <>
-__global__ void ker_arrange_decself_qkv_i8I<__half>(
+__global__ void ker_arrange_decself_qkv_i8I_i8O<__half>(
const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
float dequant_scale, float quant_scale, bool in_col32) {
@@ -1353,7 +1452,7 @@ __global__ void ker_arrange_decself_qkv_i8I<__half>(
__half2float(__ldg(&qkv_bias[blockIdx.y * hidden_size + i]));
int8_t quant_val = float2int8(val, quant_scale);
int seq_id =
- blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
+ blockIdx.x; // obvious, seq_id = batch_id * beam_size + beam_id
if (blockIdx.y == 0) {
// for query
new_q[seq_id * hidden_size + i] = quant_val;
@@ -1374,39 +1473,39 @@ __global__ void ker_arrange_decself_qkv_i8I<__half>(
}
 template <typename T>
-void ker_arrange_decself_qkv_i8I_launcher(
+void ker_arrange_decself_qkv_i8I_i8O_launcher(
int step_token_num, int hidden_size, cudaStream_t stream,
const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
int max_thread_per_block, float dequant_scale, float quant_scale,
bool in_col32) {
-  ker_arrange_decself_qkv_i8I<T>
+  ker_arrange_decself_qkv_i8I_i8O<T>
       <<<dim3(step_token_num, 3), max_thread_per_block, 0, stream>>>(
ori_qkv, qkv_bias, new_q, new_k, new_v, head_num, dim_per_head,
max_step, step_id, dequant_scale, quant_scale, in_col32);
}
// template <>
-// void ker_arrange_decself_qkv_i8I_launcher<__half>(
+// void ker_arrange_decself_qkv_i8I_i8O_launcher<__half>(
// int step_token_num, int hidden_size, cudaStream_t stream,
// const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t
// *new_k, int8_t *new_v, int head_num, int dim_per_head, int max_step, int
// step_id, int max_thread_per_block, float dequant_scale, float
// quant_scale, bool in_col32) {
-// ker_arrange_decself_qkv_i8I<__half>
+// ker_arrange_decself_qkv_i8I_i8O<__half>
 //       <<<dim3(step_token_num, 3), max_thread_per_block, 0, stream>>>(
// ori_qkv, qkv_bias, new_q, new_k, new_v, head_num, dim_per_head,
// max_step, step_id, dequant_scale, quant_scale, in_col32);
// }
-template void ker_arrange_decself_qkv_i8I_launcher<float>(
+template void ker_arrange_decself_qkv_i8I_i8O_launcher<float>(
int step_token_num, int hidden_size, cudaStream_t stream,
const int8_t *ori_qkv, const float *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
int max_thread_per_block, float dequant_scale, float quant_scale,
bool in_col32);
-template void ker_arrange_decself_qkv_i8I_launcher<__half>(
+template void ker_arrange_decself_qkv_i8I_i8O_launcher<__half>(
int step_token_num, int hidden_size, cudaStream_t stream,
const int8_t *ori_qkv, const __half *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
@@ -1414,7 +1513,7 @@ template void ker_arrange_decself_qkv_i8I_launcher<__half>(
bool in_col32);
/**
-@brief: ker_fuse_softmax_new_value_int8
+@brief: ker_fuse_softmax_new_value_i32I_i8O
fused query-key correlation softmax and new_value for decoder self attention
@thread
@@ -1424,10 +1523,10 @@ blockDim.x = first multiple of WARP_SIZE greater than cur_step + 1
@param
correlation: [batch_size, beam_size, head_num, cur_step + 1]
*/
-__global__ void ker_fuse_softmax_new_value_int8(
+__global__ void ker_fuse_softmax_new_value_i32I_i8O(
const int32_t *logits, const int8_t *v, int8_t *new_v, int step_num,
int max_step, int head_num, int dim_per_head, float attn_scale,
- float dequant_scale, float quant_scale, bool col32_out) {
+ float dequant_scale, float quant_scale, bool out_col32) {
int idx = blockIdx.x * max_step + threadIdx.x;
float val = threadIdx.x < step_num ? float(logits[idx]) * dequant_scale *
dequant_scale * attn_scale
@@ -1470,28 +1569,28 @@ __global__ void ker_fuse_softmax_new_value_int8(
int col = head_idx * dim_per_head + i;
int col_size = head_num * dim_per_head;
int new_v_idx = row * col_size + col;
- if (col32_out) {
+ if (out_col32) {
new_v_idx = row_major2flat_col32(row, col, row_size, col_size);
}
new_v[new_v_idx] = float2int8(block_new_value[i], quant_scale);
}
}
-void ker_fuse_softmax_new_value_int8_launcher(
+void ker_fuse_softmax_new_value_i32I_i8O_launcher(
const int32_t *correlation, const int8_t *v, int8_t *new_v,
int batch_head_num, int step_num, int max_step, int head_num,
int dim_per_head, float attn_scale, float dequant_scale, float quant_scale,
- bool col32_out, cudaStream_t stream) {
+ bool out_col32, cudaStream_t stream) {
int block_dim = step_num;
if (step_num < 1024) {
block_dim = (step_num + 31) >> 5;
block_dim *= 32;
}
- ker_fuse_softmax_new_value_int8<<<
+ ker_fuse_softmax_new_value_i32I_i8O<<<
batch_head_num, block_dim,
dim_per_head * sizeof(float) + step_num * sizeof(float), stream>>>(
correlation, v, new_v, step_num, max_step, head_num, dim_per_head,
- attn_scale, dequant_scale, quant_scale, col32_out);
+ attn_scale, dequant_scale, quant_scale, out_col32);
}
template
@@ -1806,5 +1905,420 @@ template void select_beam_rough_topk_i8I_launcher<__half>(
int max_thread_per_block, cudaStream_t stream, int beam_size,
float diverse_lambda, int end_id, bool in_col32);
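+
+/**
+@brief: ker_topk_sample_i8I
+top-k sampling over int8 logits with an extra logit bias term; the sampled id
+of each sequence is written to position batch_seq_len of its slot in
+old_input_ids
+*/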
+template <typename T, int k>
+__global__ void ker_topk_sample_i8I(const int8_t *logits, const T *logit_bias,
+ int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const int max_step,
+ const int batch_seq_len, int logits_seq_len,
+ int *unfinished, curandState *curandstate,
+ int eos_id, float dequant_scale,
+ bool in_col32) {
+ int last_token_idx_in_batch = blockIdx.x * max_step + batch_seq_len - 1;
+
+ /* add EOS to end if last token is EOS */
+ if (batch_seq_len > 1 && old_input_ids[last_token_idx_in_batch] == eos_id) {
+ if (threadIdx.x == 0) {
+ old_input_ids[last_token_idx_in_batch + 1] = eos_id;
+ }
+ return;
+ }
+ int logits_token_idx_in_batch =
+ blockIdx.x * logits_seq_len + logits_seq_len - 1;
+ int left_logit_idx = logits_token_idx_in_batch * vocab_size + threadIdx.x;
+ int right_logit_idx = (logits_token_idx_in_batch + 1) * vocab_size;
+
+ /*
+ step1. find max logit and rough Kth logit over the whole vocab
+ */
+ __shared__ float s_max_logit, s_topk_logit;
+ float rough_top_kth_logit = CUDA_FLOAT_INF_NEG;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ rough_top_kth_logit = fmaxf(
+ rough_top_kth_logit,
+ (float)(logits[logits_idx]) * dequant_scale +
+ (float)__ldg(&logit_bias[idx - left_logit_idx + threadIdx.x]));
+ }
+ float max_logit = blockReduceMax(rough_top_kth_logit);
+  rough_top_kth_logit = blockRoughTopK<float, k>(rough_top_kth_logit);
+ if (threadIdx.x == 0) {
+ s_topk_logit = rough_top_kth_logit;
+ s_max_logit = max_logit;
+ }
+ __syncthreads();
+
+ __shared__ int s_tid;
+
+ if (k != 1) {
+ /* step2 hold one logit per thread which larger than Kth logit and sample
+ * from them */
+ float topk_exp_sum, topk_exp = CUDA_FLOAT_INF_NEG;
+ int topk_tid = vocab_size;
+ // int test_num = 0;
+ __shared__ float s_topk_exp_sum;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(
+ row_id, col_id, gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit =
+ (float)logits[logits_idx] * dequant_scale +
+ (float)__ldg(&logit_bias[idx - left_logit_idx + threadIdx.x]);
+ float logit_exp = expf(fmaxf(logit - s_max_logit, logit_thresh_min));
+ // if (logit >= s_topk_logit) test_num++;
+ if (logit >= s_topk_logit && logit_exp > topk_exp) {
+ topk_exp = logit_exp;
+ topk_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+
+ // test_num = blockReduceSum(test_num);
+ // __shared__ int s_test_num;
+ // if (threadIdx.x == 0) {
+ // s_test_num = test_num;
+ // if (s_test_num != 1) printf("sample from top %d\n", s_test_num);
+ // // printf("sample from top %s", test_num);
+ // }
+ // __syncthreads();
+
+ if (topk_tid == vocab_size) topk_exp = 0;
+ topk_exp_sum = blockReduceSum(topk_exp);
+ if (threadIdx.x == 0) {
+ s_topk_exp_sum = topk_exp_sum;
+ }
+ __syncthreads();
+
+ /* calculate cumulative probability */
+ float topk_prob = topk_exp / s_topk_exp_sum;
+ float prefix_sum_prob;
+    typedef cub::BlockScan<float, 1024> BlockScan;
+ __shared__ typename BlockScan::TempStorage temp_storage;
+ BlockScan(temp_storage).InclusiveSum(topk_prob, prefix_sum_prob);
+
+ __shared__ float random_x;
+ if (threadIdx.x == 0) {
+ random_x = curand_uniform(curandstate + blockIdx.x);
+ }
+ __syncthreads();
+
+ if (threadIdx.x == 0) {
+ s_tid = vocab_size;
+ }
+ __syncthreads();
+
+ int threadID = threadIdx.x;
+ __shared__ int s_threadID;
+ __shared__ float s_max_prob;
+ if (random_x > prefix_sum_prob) threadID = blockDim.x;
+ threadID = blockReduceMin(threadID);
+ float max_prob = blockReduceMax(topk_prob);
+ if (threadIdx.x == 0) {
+ s_threadID = threadID;
+ s_max_prob = max_prob;
+ }
+ __syncthreads();
+ if (threadIdx.x == s_threadID) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+ if (s_tid == vocab_size && topk_prob == s_max_prob) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+ } else {
+ s_tid = vocab_size;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(
+ row_id, col_id, gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit =
+ (float)logits[logits_idx] * dequant_scale +
+ (float)__ldg(&logit_bias[idx - left_logit_idx + threadIdx.x]);
+ if (logit == s_max_logit) {
+ s_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+ __syncthreads();
+ }
+
+  /* if the newly sampled tid is not EOS, set unfinished to TRUE */
+ if (threadIdx.x == 0) {
+ if (s_tid != eos_id) unfinished[0] = 1;
+ }
+
+ /* step3 write back new sampled ids */
+ if (threadIdx.x == 0) {
+ old_input_ids[last_token_idx_in_batch + 1] = s_tid;
+ }
+}
+
+template <typename T>
+void ker_topk_sample_i8I_launcher(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const T *logit_bias, int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const int k, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32) {
+ if (k == 1)
+    ker_topk_sample_i8I<T, 1><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 2)
+    ker_topk_sample_i8I<T, 2><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 4)
+    ker_topk_sample_i8I<T, 4><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 8)
+    ker_topk_sample_i8I<T, 8><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 16)
+    ker_topk_sample_i8I<T, 16><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else if (k == 32)
+    ker_topk_sample_i8I<T, 32><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, curandstate, eos_id,
+ dequant_scale, in_col32);
+ else {
+ throw std::invalid_argument("topk argument should be in [1,2,4,8,16,32]");
+ }
+}
+
+template void ker_topk_sample_i8I_launcher<float>(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const float *logit_bias, int *old_input_ids, int *new_input_idx,
+ const int vocab_size, const int k, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32);
+
+template void ker_topk_sample_i8I_launcher<__half>(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const __half *logit_bias, int *old_input_ids, int *new_input_idx,
+ const int vocab_size, const int k, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32);
+
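+/**
+@brief: ker_topp_sample_i8I
+nucleus (top-p) sampling over int8 logits with an extra logit bias term; the
+sampled id of each sequence is written to position batch_seq_len of its slot
+in old_input_ids
+*/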
+template <typename T>
+__global__ void ker_topp_sample_i8I(const int8_t *logits, const T *logit_bias,
+ int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const int max_step,
+ const int batch_seq_len, int logits_seq_len,
+ int *unfinished, float p,
+ curandState *curandstate, int eos_id,
+ float dequant_scale, bool in_col32) {
+ int token_idx_in_batch = blockIdx.x * max_step + batch_seq_len - 1;
+
+ /* add EOS to end if last token is EOS */
+ if (batch_seq_len > 1 && old_input_ids[token_idx_in_batch] == eos_id) {
+ if (threadIdx.x == 0) {
+ old_input_ids[token_idx_in_batch + 1] = eos_id;
+ }
+ return;
+ }
+ int logits_token_idx_in_batch =
+ blockIdx.x * logits_seq_len + logits_seq_len - 1;
+ int left_logit_idx = logits_token_idx_in_batch * vocab_size + threadIdx.x;
+ int right_logit_idx = (logits_token_idx_in_batch + 1) * vocab_size;
+
+ /* step1. find max logit in each thread and sample from these probs with
+ * nucleus sampling */
+ __shared__ float s_max_logit;
+ float max_logit = CUDA_FLOAT_INF_NEG;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+    max_logit =
+        fmaxf(max_logit,
+              (float)logits[logits_idx] * dequant_scale +
+                  (float)__ldg(&logit_bias[idx - left_logit_idx + threadIdx.x]));
+ }
+ float max_logit_array[1];
+ max_logit_array[0] = max_logit;
+  typedef cub::BlockRadixSort<float, 1024, 1> BlockRadixSort;
+ __shared__ typename BlockRadixSort::TempStorage sort_temp_storage;
+ BlockRadixSort(sort_temp_storage).SortDescending(max_logit_array);
+ float presum_max_logit_exp;
+ max_logit = max_logit_array[0];
+
+ float block_max_logit = blockReduceMax(max_logit);
+ if (threadIdx.x == 0) {
+ s_max_logit = block_max_logit;
+ }
+ __syncthreads();
+
+ float biased_logit_exp =
+ expf(fmaxf(max_logit - s_max_logit, logit_thresh_min));
+
+  typedef cub::BlockScan<float, 1024> BlockScan;
+ __shared__ typename BlockScan::TempStorage presum_temp_storage;
+ BlockScan(presum_temp_storage)
+ .InclusiveSum(biased_logit_exp, presum_max_logit_exp);
+
+ float topp_exp_threshold;
+ if (threadIdx.x == blockDim.x - 1) {
+ topp_exp_threshold = p * presum_max_logit_exp;
+ }
+ __shared__ float s_presum_logit_exp_threshold;
+ if (presum_max_logit_exp > topp_exp_threshold) {
+ presum_max_logit_exp = CUDA_FLOAT_INF_NEG;
+ }
+ float logit_exp_threshold = blockReduceMax(presum_max_logit_exp);
+ if (threadIdx.x == 0) {
+ s_presum_logit_exp_threshold = logit_exp_threshold;
+ }
+ __syncthreads();
+
+ __shared__ float s_logit_threshold;
+ if (presum_max_logit_exp == s_presum_logit_exp_threshold) {
+ s_logit_threshold = max_logit;
+ }
+ __syncthreads();
+
+ /* step2. hold one logit per thread which is no smaller than the threshold
+ * logit and sample from them */
+ float topk_exp_sum, topk_exp = CUDA_FLOAT_INF_NEG;
+ int topk_tid = vocab_size;
+ int test_num = 0;
+ __shared__ float s_topk_exp_sum;
+ for (int idx = left_logit_idx; idx < right_logit_idx; idx += blockDim.x) {
+ int logits_idx;
+ if (in_col32) {
+ int row_id = logits_token_idx_in_batch;
+ int col_id = idx - logits_token_idx_in_batch * vocab_size;
+ logits_idx = row_major2flat_col32(row_id, col_id,
+ gridDim.x * logits_seq_len, vocab_size);
+ } else {
+ logits_idx = idx;
+ }
+ float logit = (float)logits[logits_idx] * dequant_scale +
+ (float)__ldg(&logit_bias[idx - left_logit_idx + threadIdx.x]);
+ float logit_exp = expf(fmaxf(logit - s_max_logit, logit_thresh_min));
+ if (logit >= s_logit_threshold) test_num++;
+ if (logit >= s_logit_threshold && logit_exp > topk_exp) {
+ topk_exp = logit_exp;
+ topk_tid = idx - left_logit_idx + threadIdx.x;
+ }
+ }
+
+ test_num = blockReduceSum(test_num);
+
+ if (topk_tid == vocab_size) topk_exp = 0;
+ topk_exp_sum = blockReduceSum(topk_exp);
+ if (threadIdx.x == 0) {
+ s_topk_exp_sum = topk_exp_sum;
+ }
+ __syncthreads();
+
+ /* calculate cumulative probability */
+ float topk_prob = topk_exp / s_topk_exp_sum;
+ float prefix_sum_prob;
+ BlockScan(presum_temp_storage).InclusiveSum(topk_prob, prefix_sum_prob);
+
+ __shared__ float random_x;
+ if (threadIdx.x == 0) {
+ random_x = curand_uniform(curandstate + blockIdx.x);
+ }
+ __syncthreads();
+
+ __shared__ int s_tid;
+ if (threadIdx.x == 0) {
+ s_tid = vocab_size;
+ }
+ __syncthreads();
+
+ int threadID = threadIdx.x;
+ __shared__ int s_threadID;
+ __shared__ float s_max_prob;
+ if (random_x > prefix_sum_prob) threadID = blockDim.x;
+ threadID = blockReduceMin(threadID);
+ float max_prob = blockReduceMax(topk_prob);
+ if (threadIdx.x == 0) {
+ s_threadID = threadID;
+ s_max_prob = max_prob;
+ }
+ __syncthreads();
+ if (threadIdx.x == s_threadID) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+ if (s_tid == vocab_size && topk_prob == s_max_prob) {
+ s_tid = topk_tid;
+ }
+ __syncthreads();
+
+ /* if new sampled tid is not EOS, set unfinished TRUE */
+ if (threadIdx.x == 0) {
+ if (s_tid != eos_id) unfinished[0] = 1;
+ }
+
+ /* step3 write back new sampled ids */
+ if (threadIdx.x == 0) {
+ old_input_ids[token_idx_in_batch + 1] = s_tid;
+ }
+}
+
+template <typename T>
+void ker_topp_sample_i8I_launcher(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const T *logit_bias, int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const float p, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32) {
+ ker_topp_sample_i8I<T><<<batch_size, max_thread_per_block, 0, stream>>>(
+ logits, logit_bias, old_input_ids, new_input_ids, vocab_size, max_step,
+ batch_seq_len, logits_seq_len, unfinished, p, curandstate, eos_id,
+ dequant_scale, in_col32);
+}
+
+template void ker_topp_sample_i8I_launcher<float>(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const float *logit_bias, int *old_input_ids, int *new_input_idx,
+ const int vocab_size, const float p, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32);
+
+template void ker_topp_sample_i8I_launcher<__half>(
+ int batch_size, int batch_seq_len, const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream, const int8_t *logits,
+ const __half *logit_bias, int *old_input_ids, int *new_input_idx,
+ const int vocab_size, const float p, int *unfinished,
+ curandState *curandstate, int eos_id, float dequant_scale, bool in_col32);
+
} // namespace cuda
} // namespace lightseq
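
The `ker_topp_sample_i8I` kernel above implements nucleus (top-p) sampling over int8 logits: dequantize, softmax, keep only the highest-probability tokens whose cumulative mass reaches `p`, then draw from that renormalized set. A standalone host-side sketch of the same selection rule (illustrative only, not the CUDA kernel, and all names are made up for the example):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>

// Illustrative host-side top-p (nucleus) sampling over a logits vector.
int nucleus_sample(const std::vector<float>& logits, float p, std::mt19937& rng) {
  // softmax with max subtraction for numerical stability
  float max_logit = *std::max_element(logits.begin(), logits.end());
  std::vector<float> probs(logits.size());
  float sum = 0.f;
  for (size_t i = 0; i < logits.size(); ++i) {
    probs[i] = std::exp(logits[i] - max_logit);
    sum += probs[i];
  }
  for (float& x : probs) x /= sum;

  // sort token ids by probability, descending
  std::vector<int> order(logits.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(),
            [&](int a, int b) { return probs[a] > probs[b]; });

  // keep the smallest prefix whose cumulative probability reaches p
  float cum = 0.f;
  size_t keep = 0;
  while (keep < order.size()) {
    cum += probs[order[keep++]];
    if (cum >= p) break;
  }

  // sample from the renormalized prefix
  std::uniform_real_distribution<float> uni(0.f, cum);
  float r = uni(rng), acc = 0.f;
  for (size_t i = 0; i < keep; ++i) {
    acc += probs[order[i]];
    if (r <= acc) return order[i];
  }
  return order[keep - 1];
}

int main() {
  std::mt19937 rng(42);
  std::vector<float> logits = {2.0f, 1.0f, 0.5f, -1.0f};
  printf("sampled token id: %d\n", nucleus_sample(logits, 0.9f, rng));
}
```

The kernel performs the same prefix selection in parallel with a block-wide radix sort and inclusive scan instead of the host-side `std::sort` and running sum.
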
diff --git a/lightseq/inference/kernels/transformerKernels_int8.h b/lightseq/inference/kernels/transformerKernels_int8.h
index 247943ed..cfe7690a 100644
--- a/lightseq/inference/kernels/transformerKernels_int8.h
+++ b/lightseq/inference/kernels/transformerKernels_int8.h
@@ -2,6 +2,7 @@
#include
#include
+#include
#include
namespace lightseq {
@@ -30,7 +31,8 @@ void ker_bias_gelu_i8I_i8O_launcher(int batch_token_num, cudaStream_t stream,
int8_t *input, int8_t *output,
const T *bias, int feature_dim,
float dequant_scale, float quant_scale,
- bool in_out_col32 = false);
+ bool in_col32 = false,
+ bool out_col32 = false);
// TODO: remove clip_max
template <typename T>
@@ -38,8 +40,8 @@ void ker_bias_relu_i8I_i8O_launcher(int batch_token_num, cudaStream_t stream,
int8_t *input, int8_t *output,
const T *bias, int feature_dim,
float dequant_scale, float quant_scale,
- float clip_max, bool in_col32 = true,
- bool out_col32 = true,
+ float clip_max, bool in_col32 = false,
+ bool out_col32 = false,
bool narrow_clip = false);
template <typename T>
@@ -47,16 +49,16 @@ void ker_residual_bias_ln_i32I_i8O_launcher(
const int32_t *input, const T *scale, const T *bias, const T *residual_bias,
int8_t *output, T *residual, int batch_tokens, int hidden_size,
float dequant_scale, float quant_scale, int max_thread_per_block,
- cudaStream_t stream, bool is_post_ln = false, bool in_col32 = true,
- bool out_col32 = true, const T *colsum = nullptr);
+ cudaStream_t stream, bool is_post_ln = false, bool in_col32 = false,
+ bool out_col32 = false, const T *colsum = nullptr);
template <typename T>
void ker_residual_bias_ln_i8I_i8O_launcher(
const int8_t *input, const T *scale, const T *bias, const T *residual_bias,
int8_t *output, T *residual, int batch_tokens, int hidden_size,
float dequant_scale, float quant_scale, int max_thread_per_block,
- cudaStream_t stream, bool is_post_ln = false, bool in_col32 = true,
- bool out_col32 = true, const T *colsum = nullptr);
+ cudaStream_t stream, bool is_post_ln = false, bool in_col32 = false,
+ bool out_col32 = false, const T *colsum = nullptr);
template <typename T>
void ker_residual_bias_ln_i32I_launcher(
@@ -72,6 +74,14 @@ void ker_arrange_encself_qkv_i8I_launcher(
int batch_seq_len, int dim_per_head, int head_num, int max_thread_per_block,
float dequant_scale, bool in_col32 = false);
+template <typename T>
+void ker_arrange_encself_qkv_i8I_i8O_launcher(
+ int batch_token_num, int hidden_size, cudaStream_t stream,
+ const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
+ int8_t *new_v, T *d_v, int batch_seq_len, int dim_per_head, int head_num,
+ int max_thread_per_block, float dequant_scale, float quant_scale,
+ bool in_col32 = false);
+
template <typename T>
void ker_arrange_atten_output_i8O_launcher(
int batch_token_num, int hidden_size, cudaStream_t stream, const T *ori_q,
@@ -79,17 +89,17 @@ void ker_arrange_atten_output_i8O_launcher(
int max_thread_per_block, float quant_scale, bool out_col32 = false);
template <typename T>
-void ker_arrange_decself_qkv_i8I_launcher(
+void ker_arrange_decself_qkv_i8I_i8O_launcher(
int step_token_num, int hidden_size, cudaStream_t stream,
const int8_t *ori_qkv, const T *qkv_bias, int8_t *new_q, int8_t *new_k,
int8_t *new_v, int head_num, int dim_per_head, int max_step, int step_id,
int max_thread_per_block, float dequant_scale, float quant_scale,
bool in_col32 = false);
-void ker_fuse_softmax_new_value_int8_launcher(
+void ker_fuse_softmax_new_value_i32I_i8O_launcher(
const int32_t *correlation, const int8_t *v, int8_t *new_v,
int batch_head_num, int step_num, int max_step, int head_num, int head_dim,
- float attn_scale, float dequant_scale, float quant_scale, bool col32_out,
+ float attn_scale, float dequant_scale, float quant_scale, bool out_col32,
cudaStream_t stream);
template <typename T>
@@ -110,5 +120,27 @@ void select_beam_rough_topk_i8I_launcher(
int max_thread_per_block, cudaStream_t stream, int beam_size,
float diverse_lambda, int end_id, bool in_col32 = false);
+template <typename T>
+void ker_topk_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream,
+ const int8_t *logits, const T *logit_bias,
+ int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const int k,
+ int *all_finished, curandState *curandstate,
+ int eos_id, float dequant_scale,
+ bool in_col32 = false);
+
+template <typename T>
+void ker_topp_sample_i8I_launcher(int batch_size, int batch_seq_len,
+ const int max_step, int logits_seq_len,
+ int max_thread_per_block, cudaStream_t stream,
+ const int8_t *logits, const T *logit_bias,
+ int *old_input_ids, int *new_input_ids,
+ const int vocab_size, const float p,
+ int *unfinished, curandState *curandstate,
+ int eos_id, float dequant_scale,
+ bool in_col32 = false);
+
} // namespace cuda
} // namespace lightseq
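
The `dequant_scale` and `quant_scale` arguments threaded through these launchers follow a symmetric per-tensor scheme with a quantization range of 127: values are quantized as `q = round(x * 127 / clip_max)` and recovered as `x ≈ q * clip_max / 127`. A minimal sketch of that convention, assuming symmetric quantization against a calibrated `clip_max` (illustrative only, not LightSeq code):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Symmetric per-tensor int8 quantization with a calibrated clip_max,
// mirroring the quant_scale = 127 / clip_max and
// dequant_scale = clip_max / 127 factors passed to the launchers above.
constexpr float kQuantRange = 127.f;

int8_t quantize(float x, float clip_max) {
  float scaled = x * (kQuantRange / clip_max);
  scaled = std::min(std::max(scaled, -kQuantRange), kQuantRange);  // clip
  return static_cast<int8_t>(std::nearbyint(scaled));
}

float dequantize(int8_t q, float clip_max) {
  return static_cast<float>(q) * (clip_max / kQuantRange);
}

int main() {
  float clip_max = 4.0f;  // e.g. a per-layer activation clip from calibration
  const float xs[] = {0.03f, -1.7f, 3.9f, 5.2f /* out of range, gets clipped */};
  for (float x : xs) {
    int8_t q = quantize(x, clip_max);
    printf("x=% .2f  ->  q=%4d  ->  x'=% .3f\n", x, q, dequantize(q, clip_max));
  }
}
```
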
diff --git a/lightseq/inference/model/CMakeLists.txt b/lightseq/inference/model/CMakeLists.txt
index 16275320..c28ec64a 100644
--- a/lightseq/inference/model/CMakeLists.txt
+++ b/lightseq/inference/model/CMakeLists.txt
@@ -42,6 +42,18 @@ endif()
target_include_directories(gpt_model PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+add_library(quant_gpt_model STATIC quant_gpt_encoder.cc.cu)
+target_link_libraries(quant_gpt_model PUBLIC cuda_kernels)
+target_link_libraries(quant_gpt_model PUBLIC quant_gpt_weight)
+if(DYNAMIC_API)
+ target_link_libraries(quant_gpt_model PRIVATE CUDA::cublas CUDA::cublasLt)
+else()
+ target_link_libraries(quant_gpt_model PRIVATE CUDA::cublas_static
+ CUDA::cublasLt_static)
+endif()
+
+target_include_directories(quant_gpt_model PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
add_library(bert_model STATIC bert_encoder.cc.cu)
target_link_libraries(bert_model PUBLIC cuda_kernels)
target_link_libraries(bert_model PUBLIC bert_weight)
@@ -52,6 +64,16 @@ else()
CUDA::cublasLt_static)
endif()
+add_library(quant_bert_model STATIC quant_bert_encoder.cc.cu)
+target_link_libraries(quant_bert_model PUBLIC cuda_kernels)
+target_link_libraries(quant_bert_model PUBLIC quant_bert_weight)
+if(DYNAMIC_API)
+ target_link_libraries(quant_bert_model PRIVATE CUDA::cublas CUDA::cublasLt)
+else()
+ target_link_libraries(quant_bert_model PRIVATE CUDA::cublas_static
+ CUDA::cublasLt_static)
+endif()
+
set(moe_files moe_decoder.cc.cu moe_encoder.cc.cu)
add_library(moe_model STATIC ${moe_files})
target_link_libraries(moe_model PUBLIC cuda_kernels)
diff --git a/lightseq/inference/model/encoder.h b/lightseq/inference/model/encoder.h
index b54bf6b7..fe204dcb 100644
--- a/lightseq/inference/model/encoder.h
+++ b/lightseq/inference/model/encoder.h
@@ -17,7 +17,7 @@
/**
@file
-Transformer decoder, composed by gemm lib and
+Transformer encoder, composed by gemm lib and
custom cuda kernel function
*/
diff --git a/lightseq/inference/model/gpt_encoder.h b/lightseq/inference/model/gpt_encoder.h
index 3ea74f6a..8ca2856f 100644
--- a/lightseq/inference/model/gpt_encoder.h
+++ b/lightseq/inference/model/gpt_encoder.h
@@ -53,7 +53,7 @@ class GptEncoder {
std::vector _h_sample_id;
int _h_unfinished;
- // gpu memeory buffer
+ // gpu memory buffer
_DataType *_p_d_query;
_DataType *_p_d_k_cache;
_DataType *_p_d_v_cache;
diff --git a/lightseq/inference/model/quant_bert_encoder.cc.cu b/lightseq/inference/model/quant_bert_encoder.cc.cu
new file mode 100644
index 00000000..c02b90ea
--- /dev/null
+++ b/lightseq/inference/model/quant_bert_encoder.cc.cu
@@ -0,0 +1,483 @@
+#include "quant_bert_encoder.h"
+#include "../kernels/embKernels_int8.h"
+#include "../kernels/transformerKernels.h"
+#include "../kernels/transformerKernels_int8.h"
+#include "cublas_helper.h"
+
+/**
+@file
+QuantBert encoder, composed by gemm lib and
+ custom cuda kernel function
+*/
+
+namespace lightseq {
+namespace cuda {
+
+template <OperationType OpType_>
+QuantBertEncoder<OpType_>::QuantBertEncoder(
+ int max_batch_size, const int *p_d_token_id, int *p_d_padding_mask,
+ _DataType *p_d_output, const QuantBertWeight<OpType_> &tw,
+ cudaStream_t stream, cublasHandle_t hd, const int *p_d_lang_id)
+ : _max_batch_size(max_batch_size),
+ _p_d_token_id(p_d_token_id),
+ _p_d_padding_mask(p_d_padding_mask),
+ _p_d_output(p_d_output),
+ _p_d_lang_id(p_d_lang_id),
+ _tw(tw),
+ _stream(stream),
+ _hd(hd),
+ _p_d_src_emb_wei(tw.get_src_emb_wei()),
+ _p_d_enc_wei(tw.get_enc_wei()),
+ _fone((_DataType)1.f),
+ _fzero((_DataType)0.f),
+ _src_emb_clip_max(tw.get_src_emb_clip_max()),
+ _enc_clip_max(tw.get_enc_clip_max()),
+ _ione((int32_t)1),
+ _izero((int32_t)0),
+ _atten_scaler((_DataType)sqrt(1.f / tw._dim_per_head)),
+ _max_batch_dim(max_batch_size * tw._max_step * tw._hidden_size),
+ _max_thread_per_block(1024) {
+ CHECK_GPU_ERROR(cublasLtCreate(&_cublas_lt_handle));
+}
+
+/**
+Init the GPU memory pointers which point to
+ the memory buffers needed by the encoder.
+These buffers are used by the custom cuda kernel functions;
+ find the corresponding function to see how each buffer is used
+*/
+template <OperationType OpType_>
+void QuantBertEncoder<OpType_>::init_buffer() {
+ std::cout << "encoder buffer init start" << std::endl;
+
+ _DataType *qkv_buf;
+ CHECK_GPU_ERROR(cudaMalloc(&qkv_buf, 3 * _max_batch_dim * sizeof(_DataType)));
+ _p_d_q = qkv_buf;
+ _p_d_k = qkv_buf + _max_batch_dim;
+ _p_d_v = qkv_buf + 2 * _max_batch_dim;
+
+ CHECK_GPU_ERROR(cudaMalloc(&_p_d_c, _max_batch_size * _tw._head_num *
+ _tw._max_step * _tw._max_step *
+ sizeof(_DataType)));
+
+ int max_batch_dim = _max_batch_size * _tw._max_step *
+ std::max(_tw._inner_size, _tw._hidden_size * 3);
+ CHECK_GPU_ERROR(cudaMalloc(&_int8_ffn_in_buf, max_batch_dim));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int32_ffn_out_buf, max_batch_dim * sizeof(int32_t)));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_ffn_out_buf, max_batch_dim * sizeof(int8_t)));
+
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_src_emb_wei,
+ _tw._src_vocab_size * _tw._hidden_size * sizeof(int8_t)));
+ quantize_weight(_p_d_src_emb_wei[0], _int8_p_d_src_emb_wei,
+ _tw._src_vocab_size, _tw._hidden_size,
+ _quant_range / _src_emb_clip_max, _stream, _cublas_lt_handle,
+ kRowMajor);
+
+ _p_device_emb.push_back(nullptr);
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[1], _tw._max_step * _tw._hidden_size, _stream));
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[2], _tw._hidden_size, _stream));
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[3], _tw._hidden_size, _stream));
+ if (_tw._multilg_type != 0) {
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[4], _tw._hidden_size, _stream));
+ } else {
+ _p_device_emb.push_back(nullptr);
+ }
+
+ // prepare gpu memory for weight
+ _int8_p_d_enc_wei = std::vector<int8_t *>(_tw._n_enc_layer * 4);
+ _scaled_ffn2_colsum = std::vector<_DataType *>(_tw._n_enc_layer);
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ CHECK_GPU_ERROR(cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4],
+ _tw._hidden_size * 3 * _tw._hidden_size));
+ CHECK_GPU_ERROR(cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 1],
+ _tw._hidden_size * _tw._hidden_size));
+ CHECK_GPU_ERROR(cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 2],
+ _tw._hidden_size * _tw._inner_size));
+ CHECK_GPU_ERROR(cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 3],
+ _tw._inner_size * _tw._hidden_size));
+
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 1], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(to_gpu(_p_d_enc_wei[_weight_offset + 3],
+ _tw._hidden_size * 3, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 5], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 6], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 7], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 9], _tw._inner_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 11], _tw._hidden_size, _stream));
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 2],
+ _int8_p_d_enc_wei[_layer_id * 4], _tw._hidden_size,
+ _tw._hidden_size * 3,
+ _quant_range / _enc_clip_max[_layer_id * 11], _stream,
+ _cublas_lt_handle);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 4],
+ _int8_p_d_enc_wei[_layer_id * 4 + 1], _tw._hidden_size,
+ _tw._hidden_size,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 1], _stream,
+ _cublas_lt_handle);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 8],
+ _int8_p_d_enc_wei[_layer_id * 4 + 2], _tw._hidden_size,
+ _tw._inner_size,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 2], _stream,
+ _cublas_lt_handle);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 10],
+ _int8_p_d_enc_wei[_layer_id * 4 + 3], _tw._inner_size,
+ _tw._hidden_size,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 3], _stream,
+ _cublas_lt_handle);
+
+ if (_tw._use_gelu) {
+ _scaled_ffn2_colsum[_layer_id] = nullptr;
+ } else {
+ CHECK_GPU_ERROR(cudaMalloc(&_scaled_ffn2_colsum[_layer_id],
+ _tw._hidden_size * sizeof(_DataType)));
+ float relu_scale = _enc_clip_max[_layer_id * 11 + 7] / 2;
+ _DataType *temp;
+ int weight_size = _tw._inner_size * _tw._hidden_size;
+
+ CHECK_GPU_ERROR(cudaMalloc(&temp, weight_size * sizeof(_DataType)));
+ CHECK_GPU_ERROR(cudaMemcpyAsync(temp, _p_d_enc_wei[_weight_offset + 10],
+ weight_size * sizeof(_DataType),
+ cudaMemcpyHostToDevice, _stream));
+ launch_scaled_colsum(temp, _scaled_ffn2_colsum[_layer_id],
+ _tw._inner_size, _tw._hidden_size, relu_scale,
+ _stream);
+
+ CHECK_GPU_ERROR(cudaGetLastError());
+ CHECK_GPU_ERROR(cudaFree(temp));
+ }
+ }
+ std::cout << "encoder buffer init succeed" << std::endl;
+ return;
+}
+
+/**
+Some requirements needed by custom cuda kernel function
+*/
+template <OperationType OpType_>
+std::string QuantBertEncoder<OpType_>::check() {
+ // if (_max_thread_per_block < _tw._hidden_size) {
+ // return "violate hidden_size <= max_thread_per_block";
+ // }
+ if (_tw._inner_size & 1) {
+ return "violate inner_size % 2 = 0";
+ }
+ if (_tw._dim_per_head & 1) {
+ return "violate dim_per_head % 2 = 0";
+ }
+ if (_tw._multilg_type == 0 && _p_d_src_emb_wei.size() != 4) {
+ return "violate p_d_src_emb_wei.size() = 4";
+ }
+ if (_tw._multilg_type != 0 && _p_d_src_emb_wei.size() != 5) {
+ return "violate p_d_src_emb_wei.size() = 5";
+ }
+ if (_p_d_enc_wei.size() != _tw._weight_per_enc_layer * _tw._n_enc_layer) {
+ return "violate p_d_enc_wei.size() = weight_per_enc_layer * n_enc_layer";
+ }
+ if (_tw._multilg_type != 0 && _p_d_lang_id == nullptr) {
+ return "lang id should not be null when multilg";
+ }
+ return "";
+}
+
+/**
+Encoder inference
+*/
+template <OperationType OpType_>
+void QuantBertEncoder<OpType_>::run_one_infer(int batch_size,
+ int batch_seq_len) {
+ if (batch_size > _max_batch_size) {
+ throw std::runtime_error("batch size of input greater than max_batch_size");
+ }
+ if (batch_seq_len > _tw._max_step) {
+ throw std::runtime_error("seq len of input greater than max_step");
+ }
+ /* ---step1. init--- */
+ _batch_size = batch_size;
+ _batch_seq_len = batch_seq_len;
+ _batch_token_num = batch_size * batch_seq_len;
+#ifdef DEBUG_RESULT
+ std::cout << "batch_size-" << batch_size << " batch_seq_len-" << batch_seq_len
+ << std::endl;
+ print_vec(_p_d_token_id, "batch_token_ids", batch_size * batch_seq_len);
+#endif
+
+ /* ---step2. encoder feedforward--- */
+ launch_enc_emb_i8I<_DataType>(
+ _int8_p_d_src_emb_wei, _p_device_emb[1], _p_d_token_id, _p_d_output,
+ _p_d_padding_mask, _tw._padding_id, batch_size, batch_seq_len,
+ _tw._hidden_size, _stream, _p_device_emb[4], _p_d_lang_id,
+ _tw._multilg_type, _src_emb_clip_max / _quant_range, false);
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "emb out: token-" << j << std::endl;
+ print_vec(_p_d_output + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "emb out", 10);
+ }
+ } // not normal
+#endif
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ self_attention();
+ ffn_add_norm();
+ }
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "encoder output: token-" << j << std::endl;
+ print_vec(_p_d_output + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "encoder_output", _tw._dim_per_head);
+ }
+ } // not normal
+#endif
+ return;
+}
+
+/**
+Encoder self attention
+*/
+template <OperationType OpType_>
+void QuantBertEncoder<OpType_>::self_attention() {
+ /* ---step 0. layer_norm, add output_bias to "query"--- */
+ if (_layer_id == 0) {
+ ker_norm_layer_resual_i8O_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _p_d_output,
+ _int8_ffn_in_buf, _p_device_wei[_weight_offset],
+ _p_device_wei[_weight_offset + 1], _p_device_wei[_weight_offset + 5],
+ _max_thread_per_block, _quant_range / _enc_clip_max[_layer_id * 11 + 4],
+ _tw._is_post_ln, true);
+ }
+ CHECK_GPU_ERROR(cudaGetLastError());
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "qkv_attn input: token-" << j << std::endl;
+ print_vec(_int8_ffn_in_buf + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "qkv_attn input", 10);
+ }
+ }
+#endif
+
+ /* ---step 1. qkv = ori_q * qkv_wei + bias, and reshape qkv for multi-head
+ * gemm--- */
+ cublasLtMM_withAlgo_i8IO(
+ _int8_ffn_out_buf, 1, _batch_token_num, _tw._hidden_size * 3,
+ _tw._hidden_size, 0, 0, 0,
+ _enc_clip_max[_layer_id * 11] * _enc_clip_max[_layer_id * 11 + 4] /
+ (_enc_clip_max[_layer_id * 11 + 8] * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4], _cublas_lt_handle,
+ _stream, false);
+
+ // get q, k, v by split and reshape qkv
+ ker_arrange_encself_qkv_i8I_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _int8_ffn_out_buf,
+ _p_device_wei[_weight_offset + 3], _p_d_q, _max_batch_dim, _batch_seq_len,
+ _tw._dim_per_head, _tw._head_num, _max_thread_per_block,
+ _enc_clip_max[_layer_id * 11 + 8] / _quant_range, true);
+
+ /* ---step 2. correlation = q * k, perform softmax on correlation--- */
+ CHECK_GPU_ERROR(cublasGemmStridedBatchedEx(
+ _hd, CUBLAS_OP_T, CUBLAS_OP_N, _batch_seq_len, _batch_seq_len,
+ _tw._dim_per_head, &_atten_scaler, _p_d_k, _AType, _tw._dim_per_head,
+ _batch_seq_len * _tw._dim_per_head, _p_d_q, _BType, _tw._dim_per_head,
+ _batch_seq_len * _tw._dim_per_head, &_fzero, _p_d_c, _CType,
+ _batch_seq_len, _batch_seq_len * _batch_seq_len,
+ _batch_size * _tw._head_num, _computeType,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+ ker_correlation_softmax_encself_launcher<_DataType>(
+ _batch_size, _batch_seq_len, _tw._head_num, _stream, _p_d_c,
+ _p_d_padding_mask);
+
+ /* ---step 3. new_q = correlation * v--- */
+ CHECK_GPU_ERROR(cublasGemmStridedBatchedEx(
+ _hd, CUBLAS_OP_N, CUBLAS_OP_N, _tw._dim_per_head, _batch_seq_len,
+ _batch_seq_len, &_fone, _p_d_v, _AType, _tw._dim_per_head,
+ _batch_seq_len * _tw._dim_per_head, _p_d_c, _BType, _batch_seq_len,
+ _batch_seq_len * _batch_seq_len, &_fzero, _p_d_q, _CType,
+ _tw._dim_per_head, _batch_seq_len * _tw._dim_per_head,
+ _batch_size * _tw._head_num, _computeType,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+ // use v to save reshaped q, since they are the same size and v
+ // will not be used again before the next multi-head attention
+ ker_arrange_atten_output_i8O_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _p_d_q, _int8_ffn_in_buf,
+ _batch_seq_len, _tw._dim_per_head, _tw._head_num, _max_thread_per_block,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 5], true);
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "out_attn input: token-" << j << std::endl;
+ print_vec(_int8_ffn_in_buf + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "out_attn input", 10);
+ }
+ }
+#endif
+
+ /* ---step 4. new_q = ori_q + new_q * output_wei--- */
+ cublasLtMM_withAlgo_i8IO(
+ _int8_ffn_out_buf, 1, _batch_token_num, _tw._hidden_size,
+ _tw._hidden_size, 0, 0, 0,
+ _enc_clip_max[_layer_id * 11 + 1] * _enc_clip_max[_layer_id * 11 + 5] /
+ (_enc_clip_max[_layer_id * 11 + 9] * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4 + 1], _cublas_lt_handle,
+ _stream, false);
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "attn_ln input: token-" << j << std::endl;
+ print_vec(_int8_ffn_out_buf + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "attn_ln input", 10);
+ }
+ }
+#endif
+
+ ker_residual_bias_ln_i8I_i8O_launcher<_DataType>(
+ _int8_ffn_out_buf, _p_device_wei[_weight_offset + 6],
+ _p_device_wei[_weight_offset + 7], _p_device_wei[_weight_offset + 11],
+ _int8_ffn_in_buf, _p_d_output, _batch_token_num, _tw._hidden_size,
+ _enc_clip_max[_layer_id * 11 + 9] / _quant_range,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 6], _max_thread_per_block,
+ _stream, _tw._is_post_ln, true, true);
+
+ return;
+}
+
+template <OperationType OpType_>
+void QuantBertEncoder<OpType_>::ffn_add_norm() {
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "ffn1 input: token-" << j << std::endl;
+ print_vec(_int8_ffn_in_buf + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "ffn1 input", 10);
+ }
+ }
+#endif
+
+ /* ---step 1. first ffn layer--- */
+ cublasLtMM_withAlgo_i8IO(
+ _int8_ffn_out_buf, 1, _batch_token_num, _tw._inner_size, _tw._hidden_size,
+ 0, 0, 0,
+ _enc_clip_max[_layer_id * 11 + 2] * _enc_clip_max[_layer_id * 11 + 6] /
+ (_enc_clip_max[_layer_id * 11 + 10] * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4 + 2], _cublas_lt_handle,
+ _stream, false);
+
+ if (_tw._use_gelu) {
+ ker_bias_gelu_i8I_i8O_launcher<_DataType>(
+ _batch_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
+ _p_device_wei[_weight_offset + 9], _tw._inner_size,
+ _enc_clip_max[_layer_id * 11 + 10] / _quant_range,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 7], true, true);
+ } else {
+ ker_bias_relu_i8I_i8O_launcher<_DataType>(
+ _batch_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
+ _p_device_wei[_weight_offset + 9], _tw._inner_size,
+ _enc_clip_max[_layer_id * 11 + 10] / _quant_range,
+ _quant_range / _enc_clip_max[_layer_id * 11 + 7],
+ _enc_clip_max[_layer_id * 11 + 7], true, true, true);
+ }
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "ffn2 input: token-" << j << std::endl;
+ print_vec(_int8_ffn_in_buf + i * _batch_seq_len * _tw._inner_size +
+ j * _tw._inner_size,
+ "ffn2 input", 10);
+ }
+ }
+#endif
+
+ /* ---step 2. second ffn layer--- */
+ cublasLtMM_withAlgo(_int32_ffn_out_buf, 1, _batch_token_num, _tw._hidden_size,
+ _tw._inner_size, 0, 0, 0, _int8_ffn_in_buf,
+ _int8_p_d_enc_wei[_layer_id * 4 + 3], _cublas_lt_handle,
+ _stream, false);
+
+ const _DataType *scale_ptr, *bias_ptr, *res_bias_ptr;
+ float clip_max, dequant_scale;
+ if (_tw._use_gelu) {
+ dequant_scale = _enc_clip_max[_layer_id * 11 + 3] *
+ _enc_clip_max[_layer_id * 11 + 7] /
+ (_quant_range * _quant_range);
+ } else {
+ dequant_scale = _enc_clip_max[_layer_id * 11 + 3] *
+ _enc_clip_max[_layer_id * 11 + 7] /
+ (2 * _quant_range * _quant_range);
+ }
+ if (_layer_id == _tw._n_enc_layer - 1) {
+ scale_ptr = _p_device_emb[2];
+ bias_ptr = _p_device_emb[3];
+
+ ker_residual_bias_ln_i32I_launcher<_DataType>(
+ _int32_ffn_out_buf, scale_ptr, bias_ptr, _p_d_output, _p_d_output,
+ _batch_token_num, _tw._hidden_size, dequant_scale,
+ _max_thread_per_block, _stream, true, _scaled_ffn2_colsum[_layer_id]);
+ } else {
+ scale_ptr = _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer];
+ bias_ptr = _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer + 1];
+ res_bias_ptr =
+ _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer + 5];
+ clip_max = _enc_clip_max[(_layer_id + 1) * 11 + 4];
+
+ ker_residual_bias_ln_i32I_i8O_launcher<_DataType>(
+ _int32_ffn_out_buf, scale_ptr, bias_ptr, res_bias_ptr, _int8_ffn_in_buf,
+ _p_d_output, _batch_token_num, _tw._hidden_size, dequant_scale,
+ _quant_range / clip_max, _max_thread_per_block, _stream,
+ _tw._is_post_ln, true, true, _scaled_ffn2_colsum[_layer_id]);
+
+#ifdef DEBUG_RESULT
+ for (int i = 0; i < _batch_size; i++) { // batch_id
+ for (int j = 0; j < _batch_seq_len; j++) { // token_id
+ std::cout << "encoder layer out: token-" << j << std::endl;
+ print_vec(_int8_ffn_in_buf + i * _batch_seq_len * _tw._hidden_size +
+ j * _tw._hidden_size,
+ "encoder layer out", 10);
+ }
+ }
+#endif
+ }
+
+ return;
+}
+
+template class QuantBertEncoder<OperationType::FP16>;
+template class QuantBertEncoder<OperationType::FP32>;
+
+} // namespace cuda
+} // namespace lightseq
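
The `alpha` passed to the `cublasLtMM_withAlgo_i8IO` calls above, e.g. `clip_in * clip_wei / (clip_out * _quant_range)`, folds dequantization of both int8 operands and requantization of the int32 accumulator into a single scale. A rough numeric check of that identity (an illustrative sketch under the symmetric-quantization assumption, not LightSeq code):

```cpp
#include <cstdint>
#include <cstdio>

// With q = round(x * 127 / clip), the int32 dot product of two int8 vectors
// recovers the real dot product up to clip_a * clip_b / (127 * 127), and
// dividing once more by clip_out / 127 re-quantizes the result into the
// output tensor's int8 range -- hence alpha = clip_a * clip_b / (clip_out * 127).
int main() {
  const float kQ = 127.f;
  float a[2] = {0.8f, -1.5f}, b[2] = {2.0f, 0.4f};
  float clip_a = 2.f, clip_b = 4.f, clip_out = 4.f;

  // quantize inputs (round half away from zero)
  int8_t qa[2], qb[2];
  for (int i = 0; i < 2; ++i) {
    qa[i] = static_cast<int8_t>(a[i] * kQ / clip_a + (a[i] >= 0 ? 0.5f : -0.5f));
    qb[i] = static_cast<int8_t>(b[i] * kQ / clip_b + (b[i] >= 0 ? 0.5f : -0.5f));
  }

  // int32 accumulation, as done by the tensor-core GEMM
  int32_t acc = 0;
  for (int i = 0; i < 2; ++i) acc += int32_t(qa[i]) * int32_t(qb[i]);

  // dequantize-and-requantize in one step
  float alpha = clip_a * clip_b / (clip_out * kQ);
  float q_out = acc * alpha;                // int8-scaled output value
  float recovered = q_out * clip_out / kQ;  // back to real units

  float exact = a[0] * b[0] + a[1] * b[1];
  printf("exact=%.4f  recovered=%.4f\n", exact, recovered);
}
```
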
diff --git a/lightseq/inference/model/quant_bert_encoder.h b/lightseq/inference/model/quant_bert_encoder.h
new file mode 100644
index 00000000..e8432a9d
--- /dev/null
+++ b/lightseq/inference/model/quant_bert_encoder.h
@@ -0,0 +1,109 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "../proto/quant_bert_weight.h"
+#include "../tools/util.h"
+
+/**
+@file
+QuantBert encoder, composed by gemm lib and
+ custom cuda kernel function
+*/
+
+namespace lightseq {
+namespace cuda {
+
+template <OperationType OpType_>
+class QuantBertEncoder {
+ private:
+ typedef OperationTypeTraits<OpType_> _optraits;
+ typedef typename _optraits::DataType _DataType;
+ const cudaDataType_t _computeType = _optraits::computeType;
+ const cudaDataType_t _AType = _optraits::AType;
+ const cudaDataType_t _BType = _optraits::BType;
+ const cudaDataType_t _CType = _optraits::CType;
+
+ // private member function
+ void self_attention();
+ void ffn_add_norm();
+
+ const int _max_batch_size;
+ int *_p_d_padding_mask; // true sequence length(remove padding), [batch_size]
+
+ const int *_p_d_lang_id;
+ const QuantBertWeight<OpType_> &_tw;
+ cudaStream_t _stream;
+ cublasHandle_t _hd;
+ cublasLtHandle_t _cublas_lt_handle;
+ const _DataType _fone;
+ const _DataType _fzero;
+ const int32_t _ione;
+ const int32_t _izero;
+ const _DataType _atten_scaler;
+ const int _max_batch_dim;
+ const int _max_thread_per_block;
+
+ _DataType *_p_d_qkv_projected;
+ _DataType *_p_d_q;
+ _DataType *_p_d_k;
+ _DataType *_p_d_v;
+ _DataType *_p_d_c;
+ _DataType *_p_d_ffn_buf1;
+ _DataType *_p_d_ffn_buf2;
+
+ int8_t *_int8_ffn_in_buf;
+ int32_t *_int32_ffn_out_buf;
+ int8_t *_int8_ffn_out_buf;
+
+ // {token_emb, pos_emb, norm_scale, norm_bias}
+ const std::vector<const _DataType *> &_p_d_src_emb_wei;
+ // {multihead_norm_scale, multihead_norm_bias, multihead_qkv_kernel,
+ // multihead_qkv_bias multihead_output_kernel, multihead_output_bias
+ // ffn_norm_scale, ffn_norm_bias}
+ // ffn_first_kernel, ffn_first_bias, ffn_second_kernel, ffn_second_bias} *
+ // encoder_layer_num
+ const std::vector<const _DataType *> &_p_d_enc_wei;
+ std::vector<const _DataType *> _p_device_wei;
+ std::vector<const _DataType *> _p_device_emb;
+
+ std::vector<int8_t *> _int8_p_d_enc_wei;
+ int8_t *_int8_p_d_src_emb_wei;
+ const float _quant_range = 127;
+ const float _src_emb_clip_max;
+ const std::vector<float> _enc_clip_max; // size: 11 * enc_layer_num
+ std::vector<_DataType *> _scaled_ffn2_colsum;
+
+ int _batch_size;
+ int _batch_seq_len;
+ int _batch_token_num;
+ int _layer_id;
+ int _weight_offset;
+
+ public:
+ const int *_p_d_token_id; // input token id [batch_size, batch_seq_len]
+ _DataType
+ *_p_d_output; // encoder output, [batch_size, batch_seq_len, hidden_size]
+
+ QuantBertEncoder(int max_batch_size, const int *p_d_token_id,
+ int *p_d_padding_mask, _DataType *p_d_output,
+ const QuantBertWeight<OpType_> &tw, cudaStream_t stream,
+ cublasHandle_t hd, const int *p_d_lang_id = nullptr);
+ void init_buffer();
+ std::string check();
+ void run_one_infer(int batch_size, int batch_seq_len);
+};
+
+} // namespace cuda
+} // namespace lightseq
diff --git a/lightseq/inference/model/quant_decoder.cc.cu b/lightseq/inference/model/quant_decoder.cc.cu
index 1672d34f..9bc833ad 100644
--- a/lightseq/inference/model/quant_decoder.cc.cu
+++ b/lightseq/inference/model/quant_decoder.cc.cu
@@ -7,7 +7,7 @@
/**
@file
-Transformer decoder, composed by gemm lib and
+QuantTransformer decoder, composed by gemm lib and
custom cuda kernel function
*/
@@ -70,15 +70,6 @@ QuantDecoder::QuantDecoder(int max_batch_size,
return;
}
-/**
-Compute GPU memory size needed by transformer decoder,
- to see how these memory is used, checkout init_buffer() for detail
-*/
-template
-long QuantDecoder::compute_buffer_bytesize() {
- return 0;
-}
-
/**
Init the GPU memory pointer which point to
the memory buffer needed by decoder.
@@ -573,7 +564,7 @@ void QuantDecoder::embedding() {
_p_device_emb[7], _p_d_lang_id, _p_d_cur_step_query, _batch_size,
_tw._beam_size, _tw._hidden_size, _tw._trg_vocab_size, _cur_step,
_tw._max_step, _tw._multilg_type, _stream,
- _trg_emb_clip_max / _quant_range);
+ _trg_emb_clip_max / _quant_range, true);
#ifdef DEBUG_RESULT
for (int i = 0; i < _batch_size; i++) { // batch_id
for (int j = 0; j < _tw._beam_size; j++) { // beam_id
@@ -647,7 +638,7 @@ void QuantDecoder::self_attention() {
// get q, k, v by split and reshape qkv
- ker_arrange_decself_qkv_i8I_launcher<_DataType>(
+ ker_arrange_decself_qkv_i8I_i8O_launcher<_DataType>(
_step_token_num, _tw._hidden_size, _stream, _int8_ffn_out_buf,
_p_device_wei[_weight_offset + 3], _int8_ffn_in_buf,
_p_d_self_k_cache1[_layer_id], _p_d_self_v_cache1[_layer_id],
@@ -680,7 +671,7 @@ void QuantDecoder::self_attention() {
CHECK_GPU_ERROR(cudaGetLastError());
#endif
- ker_fuse_softmax_new_value_int8_launcher(
+ ker_fuse_softmax_new_value_i32I_i8O_launcher(
_int32_ffn_out_buf, _p_d_self_v_cache1[_layer_id], _int8_ffn_in_buf,
_step_token_num * _tw._head_num, _cur_step + 1, _tw._max_step,
_tw._head_num, _tw._dim_per_head, float(_atten_scaler),
@@ -849,7 +840,7 @@ void QuantDecoder::ffn_add_norm() {
_step_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
_p_device_wei[_weight_offset + 15], _tw._inner_size,
_dec_clip_max[_layer_id * 19 + 16] / _quant_range,
- _quant_range / _dec_clip_max[_layer_id * 19 + 11], true);
+ _quant_range / _dec_clip_max[_layer_id * 19 + 11], true, false);
} else {
ker_bias_relu_i8I_i8O_launcher<_DataType>(
_step_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
@@ -871,7 +862,16 @@ void QuantDecoder::ffn_add_norm() {
_tw._inner_size, 0, 0, 0, 1, _cublas_lt_handle, _stream);
const _DataType *scale_ptr, *bias_ptr, *res_bias_ptr;
- float clip_max;
+ float clip_max, dequant_scale;
+ if (_tw._use_gelu) {
+ dequant_scale = _dec_clip_max[_layer_id * 19 + 5] *
+ _dec_clip_max[_layer_id * 19 + 11] /
+ (_quant_range * _quant_range);
+ } else {
+ dequant_scale = _dec_clip_max[_layer_id * 19 + 5] *
+ _dec_clip_max[_layer_id * 19 + 11] /
+ (2 * _quant_range * _quant_range);
+ }
if (_layer_id == _tw._n_dec_layer - 1) {
scale_ptr = _p_device_emb[2];
bias_ptr = _p_device_emb[3];
@@ -887,9 +887,7 @@ void QuantDecoder::ffn_add_norm() {
ker_residual_bias_ln_i32I_i8O_launcher<_DataType>(
_int32_ffn_out_buf, scale_ptr, bias_ptr, res_bias_ptr, _int8_ffn_in_buf,
- _p_d_cur_step_query, _step_token_num, _tw._hidden_size,
- _dec_clip_max[_layer_id * 19 + 5] * _dec_clip_max[_layer_id * 19 + 11] /
- (2 * _quant_range * _quant_range),
+ _p_d_cur_step_query, _step_token_num, _tw._hidden_size, dequant_scale,
_quant_range / clip_max, _max_thread_per_block, _stream, _tw._is_post_ln,
false, true, _scaled_ffn2_colsum[_layer_id]);
@@ -906,22 +904,23 @@ void QuantDecoder::ffn_add_norm() {
template <OperationType OpType_>
bool QuantDecoder<OpType_>::sample() {
- throw std::runtime_error("QuantDecoder sample() not implemented");
CHECK_GPU_ERROR(
cudaMemsetAsync(_p_d_sample_unfinished, 0, sizeof(int), _stream));
/* --- Sample new tokens from logits --- */
if (_tw._sampling_method == "topk") {
- ker_topk_sample_launcher<_DataType>(
+ ker_topk_sample_i8I_launcher<_DataType>(
_batch_size, (_cur_step + 1), _tw._max_step, 1, _max_thread_per_block,
- _stream, _p_d_logit_buf, _p_device_emb[6], _p_d_alive_seq,
+ _stream, _int8_ffn_out_buf, _p_device_emb[6], _p_d_alive_seq,
_p_d_alive_seq_buf, _tw._trg_vocab_size, _tw._topk,
- _p_d_sample_unfinished, _p_d_curandstate, _tw._end_id);
+ _p_d_sample_unfinished, _p_d_curandstate, _tw._end_id,
+ _logits_clip_max / _quant_range, true);
} else {
- ker_topp_sample_launcher<_DataType>(
+ ker_topp_sample_i8I_launcher<_DataType>(
_batch_size, (_cur_step + 1), _tw._max_step, 1, _max_thread_per_block,
- _stream, _p_d_logit_buf, _p_device_emb[6], _p_d_alive_seq,
+ _stream, _int8_ffn_out_buf, _p_device_emb[6], _p_d_alive_seq,
_p_d_alive_seq_buf, _tw._trg_vocab_size, _tw._topp,
- _p_d_sample_unfinished, _p_d_curandstate, _tw._end_id);
+ _p_d_sample_unfinished, _p_d_curandstate, _tw._end_id,
+ _logits_clip_max / _quant_range, true);
}
#ifdef DEBUG_RESULT
print_vec(_p_d_sample_unfinished, "unfinished flag", 1);
@@ -1054,7 +1053,6 @@ void QuantDecoder::update_new_seq_probs() {
template <OperationType OpType_>
bool QuantDecoder<OpType_>::topk_greedy_search() {
- throw std::runtime_error("QuantDecoder topk_greedy_search() not implemented");
_tw._diverse_lambda = 0;
if (_cur_step == 0) {
return beam_search();
@@ -1063,11 +1061,11 @@ bool QuantDecoder::topk_greedy_search() {
CHECK_GPU_ERROR(
cudaMemsetAsync(_p_d_sample_unfinished, 0, sizeof(int), _stream));
/* --- Sample new tokens from logits --- */
- ker_topk_sample_launcher<_DataType>(
+ ker_topk_sample_i8I_launcher<_DataType>(
_step_token_num, (_cur_step + 1), _tw._max_step, 1, _max_thread_per_block,
- _stream, _p_d_logit_buf, _p_device_emb[6], _p_d_alive_seq,
+ _stream, _int8_ffn_out_buf, _p_device_emb[6], _p_d_alive_seq,
_p_d_alive_seq_buf, _tw._trg_vocab_size, 1, _p_d_sample_unfinished,
- _p_d_curandstate, _tw._end_id);
+ _p_d_curandstate, _tw._end_id, _logits_clip_max / _quant_range, true);
#ifdef DEBUG_RESULT
print_vec(_p_d_sample_unfinished, "unfinished flag", 1);
diff --git a/lightseq/inference/model/quant_decoder.h b/lightseq/inference/model/quant_decoder.h
index 63682766..9274e0fb 100644
--- a/lightseq/inference/model/quant_decoder.h
+++ b/lightseq/inference/model/quant_decoder.h
@@ -20,7 +20,7 @@
/**
@file
-Transformer decoder, composed by gemm lib and
+QuantTransformer decoder, composed by gemm lib and
custom cuda kernel function
*/
namespace lightseq {
@@ -101,7 +101,6 @@ class QuantDecoder {
_DataType* _p_d_query_buf2;
_DataType* _p_d_c;
_DataType* _p_d_encoder_out_buf;
- _DataType* _p_d_logit_buf;
int8_t* _int8_ffn_in_buf;
int32_t* _int32_ffn_out_buf;
@@ -159,7 +158,6 @@ class QuantDecoder {
QuantTransformerWeight& tw, cudaStream_t stream,
cublasHandle_t hd, bool output_topk = false,
const int* p_d_lang_id = nullptr);
- long compute_buffer_bytesize();
void init_buffer();
std::string check();
void run_one_infer(int batch_size, int batch_seq_len);
diff --git a/lightseq/inference/model/quant_encoder.cc.cu b/lightseq/inference/model/quant_encoder.cc.cu
index 3f9d2b9d..075bccf9 100644
--- a/lightseq/inference/model/quant_encoder.cc.cu
+++ b/lightseq/inference/model/quant_encoder.cc.cu
@@ -7,7 +7,7 @@
/**
@file
-Transformer encoder, composed by gemm lib and
+QuantTransformer encoder, composed by gemm lib and
custom cuda kernel function
*/
@@ -45,19 +45,6 @@ QuantEncoder::QuantEncoder(int max_batch_size, int *p_d_token_id,
CHECK_GPU_ERROR(cublasLtCreate(&_cublas_lt_handle));
}
-/**
-Compute GPU memory size needed by transformer encoder,
- to see how these memory is used, checkout init_buffer() for detail
-*/
-template
-long QuantEncoder::compute_buffer_bytesize() {
- // long sz1 = _max_batch_dim * 6 +
- // _max_batch_size * _tw._head_num * _tw._max_step * _tw._max_step;
- // long sz2 = _max_batch_dim + _max_batch_size * _tw._max_step *
- // _tw._inner_size; return max(sz1, sz2) * sizeof(_DataType);
- return 0;
-}
-
/**
Init the GPU memory pointer which point to
the memory buffer needed by encoder.
@@ -89,9 +76,10 @@ void QuantEncoder::init_buffer() {
CHECK_GPU_ERROR(
cudaMalloc(&_int8_p_d_src_emb_wei,
_tw._src_vocab_size * _tw._hidden_size * sizeof(int8_t)));
- quantize_weight(_p_d_src_emb_wei[0], _int8_p_d_src_emb_wei, _tw._hidden_size,
- _tw._src_vocab_size, _quant_range / _src_emb_clip_max,
- _stream, _cublas_lt_handle, kRowMajor);
+ quantize_weight(_p_d_src_emb_wei[0], _int8_p_d_src_emb_wei,
+ _tw._src_vocab_size, _tw._hidden_size,
+ _quant_range / _src_emb_clip_max, _stream, _cublas_lt_handle,
+ kRowMajor);
_p_device_emb.push_back(nullptr);
_p_device_emb.push_back(
@@ -247,7 +235,7 @@ void QuantEncoder::run_one_infer(int batch_size, int batch_seq_len) {
_int8_p_d_src_emb_wei, _p_device_emb[1], _p_d_token_id, _p_d_output,
_p_d_padding_mask, _tw._padding_id, batch_size, batch_seq_len,
_tw._hidden_size, _stream, _p_device_emb[4], _p_d_lang_id,
- _tw._multilg_type, _src_emb_clip_max / _quant_range);
+ _tw._multilg_type, _src_emb_clip_max / _quant_range, true);
#ifdef DEBUG_RESULT
for (int i = 0; i < _batch_size; i++) { // batch_id
for (int j = 0; j < _batch_seq_len; j++) { // token_id
@@ -356,7 +344,7 @@ void QuantEncoder::self_attention() {
_int8_ffn_in_buf, _p_d_output, _batch_token_num, _tw._hidden_size,
_enc_clip_max[_layer_id * 12 + 9] / _quant_range,
_quant_range / _enc_clip_max[_layer_id * 12 + 6], _max_thread_per_block,
- _stream, _tw._is_post_ln, true);
+ _stream, _tw._is_post_ln, true, true);
return;
}
@@ -376,7 +364,7 @@ void QuantEncoder::ffn_add_norm() {
_batch_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
_p_device_wei[_weight_offset + 9], _tw._inner_size,
_enc_clip_max[_layer_id * 12 + 10] / _quant_range,
- _quant_range / _enc_clip_max[_layer_id * 12 + 7], true);
+ _quant_range / _enc_clip_max[_layer_id * 12 + 7], true, true);
} else {
ker_bias_relu_i8I_i8O_launcher<_DataType>(
_batch_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
@@ -393,16 +381,23 @@ void QuantEncoder::ffn_add_norm() {
_stream, false);
const _DataType *scale_ptr, *bias_ptr, *res_bias_ptr;
- float clip_max;
+ float clip_max, dequant_scale;
+ if (_tw._use_gelu) {
+ dequant_scale = _enc_clip_max[_layer_id * 12 + 3] *
+ _enc_clip_max[_layer_id * 12 + 7] /
+ (_quant_range * _quant_range);
+ } else {
+ dequant_scale = _enc_clip_max[_layer_id * 12 + 3] *
+ _enc_clip_max[_layer_id * 12 + 7] /
+ (2 * _quant_range * _quant_range);
+ }
if (_layer_id == _tw._n_enc_layer - 1) {
scale_ptr = _p_device_emb[2];
bias_ptr = _p_device_emb[3];
ker_residual_bias_ln_i32I_launcher<_DataType>(
_int32_ffn_out_buf, scale_ptr, bias_ptr, _p_d_output, _p_d_output,
- _batch_token_num, _tw._hidden_size,
- _enc_clip_max[_layer_id * 12 + 3] * _enc_clip_max[_layer_id * 12 + 7] /
- (2 * _quant_range * _quant_range),
+ _batch_token_num, _tw._hidden_size, dequant_scale,
_max_thread_per_block, _stream, true, _scaled_ffn2_colsum[_layer_id]);
} else {
scale_ptr = _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer];
@@ -413,9 +408,7 @@ void QuantEncoder::ffn_add_norm() {
ker_residual_bias_ln_i32I_i8O_launcher<_DataType>(
_int32_ffn_out_buf, scale_ptr, bias_ptr, res_bias_ptr, _int8_ffn_in_buf,
- _p_d_output, _batch_token_num, _tw._hidden_size,
- _enc_clip_max[_layer_id * 12 + 3] * _enc_clip_max[_layer_id * 12 + 7] /
- (2 * _quant_range * _quant_range),
+ _p_d_output, _batch_token_num, _tw._hidden_size, dequant_scale,
_quant_range / clip_max, _max_thread_per_block, _stream,
_tw._is_post_ln, true, true, _scaled_ffn2_colsum[_layer_id]);
}
diff --git a/lightseq/inference/model/quant_encoder.h b/lightseq/inference/model/quant_encoder.h
index d14f3fd0..0d77114b 100644
--- a/lightseq/inference/model/quant_encoder.h
+++ b/lightseq/inference/model/quant_encoder.h
@@ -18,7 +18,7 @@
/**
@file
-Transformer decoder, composed by gemm lib and
+QuantTransformer encoder, composed by gemm lib and
custom cuda kernel function
*/
@@ -99,7 +99,6 @@ class QuantEncoder {
_DataType *p_d_output, const QuantTransformerWeight &tw,
cudaStream_t stream, cublasHandle_t hd,
const int *p_d_lang_id = nullptr);
- long compute_buffer_bytesize();
void init_buffer();
std::string check();
void run_one_infer(int batch_size, int batch_seq_len);
diff --git a/lightseq/inference/model/quant_gpt_encoder.cc.cu b/lightseq/inference/model/quant_gpt_encoder.cc.cu
new file mode 100644
index 00000000..26f1b5e8
--- /dev/null
+++ b/lightseq/inference/model/quant_gpt_encoder.cc.cu
@@ -0,0 +1,769 @@
+#include "../kernels/gptKernels.h"
+#include "../kernels/gptKernels_int8.h"
+#include "../kernels/transformerKernels.h"
+#include "../kernels/transformerKernels_int8.h"
+#include "quant_gpt_encoder.h"
+#include "cublas_helper.h"
+
+/**
+@file
+QuantGPT encoder, composed by gemm lib and
+ custom cuda kernel function
+*/
+
+// #define DEBUG_RESULT
+
+namespace lightseq {
+namespace cuda {
+
+template <OperationType OpType_>
+QuantGptEncoder<OpType_>::QuantGptEncoder(
+ int max_batch_size, const int *p_d_token_id, float *p_d_ppl,
+ int *p_d_sample_id, const QuantGptWeight<OpType_> &tw, cudaStream_t stream,
+ cudaStream_t cache_stream, cublasHandle_t hd)
+ : _max_batch_size(max_batch_size),
+ _p_d_token_id(p_d_token_id),
+ _p_d_ppl(p_d_ppl),
+ _p_d_sample_id(p_d_sample_id),
+ _tw(tw),
+ _stream(stream),
+ _cache_stream(cache_stream),
+ _hd(hd),
+ _p_d_src_emb_wei(tw.get_src_emb_wei()),
+ _p_d_enc_wei(tw.get_enc_wei()),
+ _fone((_DataType)1.f),
+ _fzero((_DataType)0.f),
+ _src_emb_clip_max(tw.get_src_emb_clip_max()),
+ _output_ln_clip_max(tw.get_output_ln_clip_max()),
+ _logits_clip_max(tw.get_logits_clip_max()),
+ _enc_clip_max(tw.get_enc_clip_max()),
+ _ione((int32_t)1),
+ _izero((int32_t)0),
+ _atten_scaler((_DataType)sqrt(1.f / tw._dim_per_head)),
+ _max_batch_dim(max_batch_size * tw._max_step * tw._hidden_size),
+ _max_thread_per_block(1024),
+ _h_real_seq_len(max_batch_size, 0),
+ _h_ppl(max_batch_size, 0.f),
+ _h_sample_id(max_batch_size * tw._max_step, 0),
+ _h_unfinished(1) {
+ CHECK_GPU_ERROR(cublasLtCreate(&_cublas_lt_handle));
+}
+
+/**
+Init the GPU memory pointers which point to
+ the memory buffers needed by the encoder.
+These buffers are used by the custom cuda kernel functions;
+ find the corresponding function to see how each buffer is used
+*/
+template <OperationType OpType_>
+void QuantGptEncoder<OpType_>::init_buffer() {
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_p_d_real_seq_len, _max_batch_size * sizeof(int)));
+ CHECK_GPU_ERROR(cudaMalloc(&_p_d_query, _max_batch_dim * sizeof(_DataType)));
+ CHECK_GPU_ERROR(cudaMalloc((void **)&_p_d_curandstate,
+ _max_batch_size * sizeof(curandState)));
+ CHECK_GPU_ERROR(cudaMalloc((void **)&_p_d_sample_id_buf,
+ _max_batch_size * _tw._max_step * sizeof(int)));
+ CHECK_GPU_ERROR(cudaMalloc((void **)&_p_d_unfinished, sizeof(int)));
+ ker_curand_setup<<<_max_batch_size, 1, 0, _stream>>>(_p_d_curandstate);
+
+ _DataType *qkv_buf;
+ CHECK_GPU_ERROR(cudaMalloc(&qkv_buf, 3 * _max_batch_dim * sizeof(_DataType)));
+ _p_d_q = qkv_buf;
+ _p_d_k = qkv_buf + _max_batch_dim;
+ _p_d_v = qkv_buf + 2 * _max_batch_dim;
+
+ int max_attn_score_dim = round_up(
+ _max_batch_size * _tw._head_num * _tw._max_step * _tw._max_step, 32);
+
+ CHECK_GPU_ERROR(cudaMalloc(&_p_d_c, max_attn_score_dim * sizeof(_DataType)));
+
+ int max_batch_dim =
+ _max_batch_size * _tw._max_step *
+ round_up(std::max(_tw._inner_size, _tw._hidden_size * 3), 32);
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_ffn_in_buf, max_batch_dim * sizeof(int8_t)));
+ CHECK_GPU_ERROR(cudaMalloc(
+ &_int32_ffn_out_buf,
+ std::max(max_batch_dim, max_attn_score_dim) * sizeof(int32_t)));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_ffn_out_buf,
+ std::max(max_batch_dim, round_up(_tw._src_vocab_size, 32) *
+ _tw._max_step * _max_batch_size) *
+ sizeof(int8_t)));
+
+ // malloc embeddings
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_src_emb_wei,
+ _tw._src_vocab_size * _tw._hidden_size * sizeof(int8_t)));
+ quantize_weight(_p_d_src_emb_wei[0], _int8_p_d_src_emb_wei, _tw._hidden_size,
+ _tw._src_vocab_size, _quant_range / _src_emb_clip_max,
+ _stream, _cublas_lt_handle);
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_src_emb_bottom_wei,
+ _tw._src_vocab_size * _tw._hidden_size * sizeof(int8_t)));
+ quantize_weight(_p_d_src_emb_wei[0], _int8_p_d_src_emb_bottom_wei,
+ _tw._hidden_size, _tw._src_vocab_size,
+ _quant_range / _src_emb_clip_max, _stream, _cublas_lt_handle,
+ kColMajor);
+ _p_device_emb.push_back(nullptr);
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[1], _tw._max_step * _tw._hidden_size, _stream));
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[2], _tw._hidden_size, _stream));
+ _p_device_emb.push_back(
+ to_gpu(_p_d_src_emb_wei[3], _tw._hidden_size, _stream));
+
+ // malloc reused kv cache max size: _tw._hidden_size * 2 * _tw._n_enc_layer *
+ // _max_batch_size * _max_step * sizeof(T)
+ int8_t *self_kv_cache_buffer;
+ int8_t *sliding_p;
+ CHECK_GPU_ERROR(
+ cudaMalloc(&self_kv_cache_buffer,
+ _max_batch_dim * _tw._n_enc_layer * 4 * sizeof(int8_t)));
+
+ sliding_p = self_kv_cache_buffer;
+ for (int i = 0; i < _tw._n_enc_layer * 2; i++) {
+ _p_d_self_k_cache.push_back(sliding_p);
+ sliding_p += _max_batch_dim;
+ }
+ for (int i = 0; i < _tw._n_enc_layer * 2; i++) {
+ _p_d_self_v_cache.push_back(sliding_p);
+ sliding_p += _max_batch_dim;
+ }
+ _p_d_self_k_cache1 = _p_d_self_k_cache.data();
+ _p_d_self_k_cache2 = _p_d_self_k_cache.data() + _tw._n_enc_layer;
+ _p_d_self_v_cache1 = _p_d_self_v_cache.data();
+ _p_d_self_v_cache2 = _p_d_self_v_cache.data() + _tw._n_enc_layer;
+
+ // malloc weights
+ _int8_p_d_enc_wei = std::vector<int8_t *>(_tw._n_enc_layer * 4);
+ _scaled_ffn2_colsum = std::vector<_DataType *>(_tw._n_enc_layer);
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ // malloc quantized weights
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4],
+ _tw._hidden_size * 3 * _tw._hidden_size * sizeof(int8_t)));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 1],
+ _tw._hidden_size * _tw._hidden_size * sizeof(int8_t)));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 2],
+ _tw._hidden_size * _tw._inner_size * sizeof(int8_t)));
+ CHECK_GPU_ERROR(
+ cudaMalloc(&_int8_p_d_enc_wei[_layer_id * 4 + 3],
+ _tw._inner_size * _tw._hidden_size * sizeof(int8_t)));
+
+ // malloc unquantized weights
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 1], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(to_gpu(_p_d_enc_wei[_weight_offset + 3],
+ _tw._hidden_size * 3, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 5], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 6], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 7], _tw._hidden_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 9], _tw._inner_size, _stream));
+ _p_device_wei.push_back(nullptr);
+ _p_device_wei.push_back(
+ to_gpu(_p_d_enc_wei[_weight_offset + 11], _tw._hidden_size, _stream));
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 2],
+ _int8_p_d_enc_wei[_layer_id * 4], _tw._hidden_size,
+ _tw._hidden_size * 3,
+ _quant_range / _enc_clip_max[_layer_id * 12], _stream,
+ _cublas_lt_handle);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 4],
+ _int8_p_d_enc_wei[_layer_id * 4 + 1], _tw._hidden_size,
+ _tw._hidden_size,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 1], _stream,
+ _cublas_lt_handle, kColMajor);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 8],
+ _int8_p_d_enc_wei[_layer_id * 4 + 2], _tw._hidden_size,
+ _tw._inner_size,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 2], _stream,
+ _cublas_lt_handle);
+
+ quantize_weight(_p_d_enc_wei[_weight_offset + 10],
+ _int8_p_d_enc_wei[_layer_id * 4 + 3], _tw._inner_size,
+ _tw._hidden_size,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 3], _stream,
+ _cublas_lt_handle, kColMajor);
+
+ _scaled_ffn2_colsum[_layer_id] = nullptr;
+ }
+
+ CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
+ CHECK_GPU_ERROR(cudaGetLastError());
+ std::cout << "quantized encoder buffer init succeed" << std::endl;
+
+ return;
+}
+
+/**
+Some requirements needed by custom cuda kernel function
+*/
+template <OperationType OpType_>
+std::string QuantGptEncoder<OpType_>::check() {
+ // if (_max_thread_per_block < _tw._hidden_size) {
+ // return "violate hidden_size <= max_thread_per_block";
+ // }
+ if (_tw._inner_size & 1) {
+ return "violate inner_size % 2 = 0";
+ }
+ if (_tw._dim_per_head & 1) {
+ return "violate dim_per_head % 2 = 0";
+ }
+ if (_p_d_src_emb_wei.size() != 4) {
+ return "violate p_d_src_emb_wei.size() = 4";
+ }
+ if (_p_d_enc_wei.size() != _tw._weight_per_enc_layer * _tw._n_enc_layer) {
+ return "violate p_d_enc_wei.size() = weight_per_enc_layer * n_enc_layer";
+ }
+ std::string sampling_method = _tw._sampling_method;
+ if (kSamplingMethods.find(sampling_method) == kSamplingMethods.end()) {
+ return std::string("unsupported sampling_method: ") + sampling_method;
+ }
+
+ if (_tw._topk <= 0) {
+ return "topk must be positive";
+ }
+ if (_tw._topp <= 0 || _tw._topp >= 1.0) {
+ return "topp must be in (0, 1)";
+ }
+
+ return "";
+}
+
+template <OperationType OpType_>
+void QuantGptEncoder<OpType_>::run_one_infer(int batch_size,
+ int batch_seq_len) {
+ if (batch_size > _max_batch_size) {
+ throw std::runtime_error("batch size of input greater than max_batch_size");
+ }
+ if (batch_seq_len > _tw._max_step) {
+ throw std::runtime_error("seq len of input greater than max_step");
+ }
+ _batch_size = batch_size;
+ _batch_seq_len = batch_seq_len;
+ _batch_token_num = batch_size * batch_seq_len;
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_real_seq_len, _h_real_seq_len.data(),
+ sizeof(int) * _batch_size,
+ cudaMemcpyHostToDevice, _stream));
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_ppl, _h_ppl.data(),
+ sizeof(float) * _batch_size,
+ cudaMemcpyHostToDevice, _stream));
+
+#ifdef DEBUG_RESULT
+ std::cout << "batch_size-" << batch_size << " batch_seq_len-" << batch_seq_len
+ << std::endl;
+ print_vec(_p_d_token_id, "batch_token_ids", batch_size * batch_seq_len);
+#endif
+
+ // token embedding, add position embedding and layer_norm
+ ker_gpt_embedding_i8I_launcher<_DataType>(
+ batch_size, batch_seq_len, _tw._hidden_size, _stream,
+ _int8_p_d_src_emb_bottom_wei, _p_device_emb[1], _p_d_token_id, _p_d_query,
+ _p_d_real_seq_len, _tw._padding_id, 0, _src_emb_clip_max / _quant_range);
+
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ self_attention();
+ ffn_add_norm();
+ }
+
+ compute_ppl();
+ return;
+}
+
+template <OperationType OpType_>
+int QuantGptEncoder<OpType_>::run_one_sample(int batch_size,
+ int batch_seq_len) {
+ if (batch_size > _max_batch_size) {
+ throw std::runtime_error("batch size of input greater than max_batch_size");
+ }
+ if (batch_seq_len > _tw._max_step) {
+ throw std::runtime_error("seq len of input greater than max_step");
+ }
+ _batch_size = batch_size;
+ _batch_seq_len = batch_seq_len;
+ _batch_token_num = batch_size * batch_seq_len;
+
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_real_seq_len, _h_real_seq_len.data(),
+ sizeof(int) * _batch_size,
+ cudaMemcpyHostToDevice, _stream));
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_ppl, _h_ppl.data(),
+ sizeof(float) * _batch_size,
+ cudaMemcpyHostToDevice, _stream));
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_sample_id, _p_d_token_id,
+ sizeof(int) * _batch_size * _batch_seq_len,
+ cudaMemcpyDeviceToDevice, _stream));
+#ifdef DEBUG_RESULT
+ std::cout << "batch_size-" << batch_size << " batch_seq_len-" << batch_seq_len
+ << std::endl;
+ std::cout << "Sample with " << _tw._sampling_method << std::endl;
+ std::cout << "padding_id: " << _tw._padding_id << std::endl;
+ std::cout << "vocab_size: " << _tw._src_vocab_size << std::endl;
+ print_vec(_p_d_sample_id, "batch_token_ids", batch_size * batch_seq_len);
+#endif
+
+ // token embedding, add position embedding and layer_norm
+ ker_gpt_embedding_i8I_launcher<_DataType>(
+ _batch_size, _batch_seq_len, _tw._hidden_size, _stream,
+ _int8_p_d_src_emb_bottom_wei, _p_device_emb[1], _p_d_sample_id,
+ _p_d_query, _p_d_real_seq_len, _tw._padding_id, 0,
+ _src_emb_clip_max / _quant_range);
+
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ self_attention();
+ ffn_add_norm();
+ }
+
+ int8_t **ftmp = _p_d_self_k_cache2;
+ _p_d_self_k_cache2 = _p_d_self_k_cache1;
+ _p_d_self_k_cache1 = ftmp;
+ ftmp = _p_d_self_v_cache2;
+ _p_d_self_v_cache2 = _p_d_self_v_cache1;
+ _p_d_self_v_cache1 = ftmp;
+
+ if (sample_one_token() == 0 || _batch_seq_len >= _tw._max_step) {
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_sample_id_buf, _p_d_sample_id,
+ _batch_token_num * sizeof(int),
+ cudaMemcpyDeviceToDevice, _stream));
+ CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
+ return _batch_seq_len;
+ }
+
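+  // autoregressive decoding loop: each step embeds only the newly sampled token
+  // (sequence length 1) and runs the *_with_cache attention/FFN variants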
+ while (1) {
+#ifdef DEBUG_RESULT
+ std::cout << "before sample:batch_size-" << _batch_size << " batch_seq_len-"
+ << _batch_seq_len << std::endl;
+ print_vec(_p_d_sample_id, "batch_token_ids", _batch_token_num);
+#endif
+
+ // token embedding, add position embedding and layer_norm
+ ker_gpt_embedding_i8I_launcher<_DataType>(
+ batch_size, 1, _tw._hidden_size, _stream, _int8_p_d_src_emb_bottom_wei,
+ _p_device_emb[1], _p_d_last_sample_id, _p_d_query, _p_d_real_seq_len,
+ _tw._padding_id, _batch_seq_len - 1, _src_emb_clip_max / _quant_range);
+
+ for (_layer_id = 0; _layer_id < _tw._n_enc_layer; _layer_id++) {
+ _weight_offset = _layer_id * _tw._weight_per_enc_layer;
+ self_attention_with_cache();
+ ffn_add_norm_with_cache();
+ }
+
+ int8_t **ftmp = _p_d_self_k_cache2;
+ _p_d_self_k_cache2 = _p_d_self_k_cache1;
+ _p_d_self_k_cache1 = ftmp;
+ ftmp = _p_d_self_v_cache2;
+ _p_d_self_v_cache2 = _p_d_self_v_cache1;
+ _p_d_self_v_cache1 = ftmp;
+
+ if (sample_one_token_with_cache() == 0 || _batch_seq_len >= _tw._max_step)
+ break;
+ }
+
+ CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_sample_id_buf, _p_d_sample_id,
+ _batch_token_num * sizeof(int),
+ cudaMemcpyDeviceToDevice, _stream));
+ CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
+
+ return _batch_seq_len;
+}
+
+template <OperationType OpType_>
+int QuantGptEncoder<OpType_>::sample_one_token() {
+ /* ---step 1. project hidden states to vocab logits--- */
+ cublasLtMM_withAlgo_i8IO(_int8_ffn_out_buf, 1, _batch_token_num,
+ _tw._src_vocab_size, _tw._hidden_size, 0, 0, 0,
+ _output_ln_clip_max * _src_emb_clip_max /
+ (_logits_clip_max * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_src_emb_wei,
+ _cublas_lt_handle, _stream, false);
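+  // reset the device-side unfinished flag; it is copied back to the host after
+  // sampling to decide whether decoding should continue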
+ CHECK_GPU_ERROR(cudaMemsetAsync(_p_d_unfinished, 0, sizeof(int), _stream));
+  /* ---step 2. sample new tokens from logits--- */
+ if (_tw._sampling_method == "topk") {
+#ifdef DEBUG_RESULT
+ std::cout << "sampling using topk\n";
+#endif
+ ker_topk_sample_i8I_launcher(
+ _batch_size, _batch_seq_len, _batch_seq_len, _max_thread_per_block,
+ _stream, _int8_ffn_out_buf, _p_d_sample_id, _p_d_sample_id_buf,
+ _p_d_real_seq_len, _tw._src_vocab_size, _tw._topk, _p_d_unfinished,
+ _p_d_curandstate, _tw._eos_id, _logits_clip_max / _quant_range, true);
+ } else {
+#ifdef DEBUG_RESULT
+ std::cout << "sampling using topp\n";
+#endif
+ ker_topp_sample_i8I_launcher(
+ _batch_size, _batch_seq_len, _batch_seq_len, _max_thread_per_block,
+ _stream, _int8_ffn_out_buf, _p_d_sample_id, _p_d_sample_id_buf,
+ _p_d_real_seq_len, _tw._src_vocab_size, _tw._topp, _p_d_unfinished,
+ _p_d_curandstate, _tw._eos_id, _logits_clip_max / _quant_range, true);
+ }
+ int *temp = _p_d_sample_id;
+ _p_d_sample_id = _p_d_sample_id_buf;
+ _p_d_sample_id_buf = temp;
+ CHECK_GPU_ERROR(cudaMemcpyAsync(&_h_unfinished, _p_d_unfinished, sizeof(int),
+ cudaMemcpyDeviceToHost, _stream));
+ CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
+ _p_d_last_sample_id = _p_d_sample_id_buf + _batch_token_num;
+ _batch_seq_len++;
+ _batch_token_num += _batch_size;
+ return _h_unfinished;
+}
+
+template <OperationType OpType_>
+int QuantGptEncoder<OpType_>::sample_one_token_with_cache() {
+ /* ---step 1. project hidden states to vocab logits--- */
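+  // with the K/V cache in place, only the last position of each sequence is
+  // projected to vocab logits (_batch_size rows instead of _batch_token_num)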
+ cublasLtMM_withAlgo_i8IO(_int8_ffn_out_buf, 1, _batch_size,
+ _tw._src_vocab_size, _tw._hidden_size, 0, 0, 0,
+ _output_ln_clip_max * _src_emb_clip_max /
+ (_logits_clip_max * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_src_emb_wei,
+ _cublas_lt_handle, _stream, false);
+
+ CHECK_GPU_ERROR(cudaMemsetAsync(_p_d_unfinished, 0, sizeof(int), _stream));
+  /* ---step 2. sample new tokens from logits--- */
+ if (_tw._sampling_method == "topk") {
+#ifdef DEBUG_RESULT
+ std::cout << "sampling using topk\n";
+#endif
+ ker_topk_sample_i8I_launcher(
+ _batch_size, _batch_seq_len, 1, _max_thread_per_block, _stream,
+ _int8_ffn_out_buf, _p_d_sample_id, _p_d_sample_id_buf,
+ _p_d_real_seq_len, _tw._src_vocab_size, _tw._topk, _p_d_unfinished,
+ _p_d_curandstate, _tw._eos_id, _logits_clip_max / _quant_range, true);
+ } else {
+#ifdef DEBUG_RESULT
+ std::cout << "sampling using topp\n";
+#endif
+ ker_topp_sample_i8I_launcher(
+ _batch_size, _batch_seq_len, 1, _max_thread_per_block, _stream,
+ _int8_ffn_out_buf, _p_d_sample_id, _p_d_sample_id_buf,
+ _p_d_real_seq_len, _tw._src_vocab_size, _tw._topp, _p_d_unfinished,
+ _p_d_curandstate, _tw._eos_id, _logits_clip_max / _quant_range, true);
+ }
+ int *temp = _p_d_sample_id;
+ _p_d_sample_id = _p_d_sample_id_buf;
+ _p_d_sample_id_buf = temp;
+ CHECK_GPU_ERROR(cudaMemcpyAsync(&_h_unfinished, _p_d_unfinished, sizeof(int),
+ cudaMemcpyDeviceToHost, _stream));
+ CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
+ _p_d_last_sample_id = _p_d_sample_id_buf + _batch_token_num;
+ _batch_seq_len++;
+ _batch_token_num += _batch_size;
+ return _h_unfinished;
+}
+
+template <OperationType OpType_>
+void QuantGptEncoder<OpType_>::self_attention() {
+ /* ---step 0. layer_norm, add output_bias to "query"--- */
+ if (_layer_id == 0) {
+ ker_norm_layer_resual_i8O_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _p_d_query,
+ _int8_ffn_in_buf, _p_device_wei[_weight_offset],
+ _p_device_wei[_weight_offset + 1], _p_device_wei[_weight_offset + 5],
+ _max_thread_per_block, _quant_range / _enc_clip_max[_layer_id * 12 + 4],
+ false, true);
+ }
+
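+  /* ---step 1. project the normalized input to fused q, k, v with an int8 GEMM
+     (hidden_size -> 3 * hidden_size)--- */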
+ cublasLtMM_withAlgo_i8IO(
+ _int8_ffn_out_buf, 1, _batch_token_num, _tw._hidden_size * 3,
+ _tw._hidden_size, 0, 0, 0,
+ _enc_clip_max[_layer_id * 12] * _enc_clip_max[_layer_id * 12 + 4] /
+ (_enc_clip_max[_layer_id * 12 + 8] * _quant_range),
+ _int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4], _cublas_lt_handle,
+ _stream, false);
+
+#ifdef DEBUG_RESULT
+ print_vec(_int8_ffn_in_buf, "attn qkv in", 20);
+ print_vec(_int8_p_d_enc_wei[_layer_id * 4], "attn qkv w", 20);
+ print_vec(_int8_ffn_out_buf, "attn qkv out", 20);
+#endif
+
+ // get q, k, v by split and reshape qkv
+ ker_arrange_encself_qkv_i8I_i8O_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _int8_ffn_out_buf,
+ _p_device_wei[_weight_offset + 3], _int8_ffn_in_buf,
+ _p_d_self_k_cache1[_layer_id], _p_d_self_v_cache1[_layer_id], _p_d_v,
+ _batch_seq_len, _tw._dim_per_head, _tw._head_num, _max_thread_per_block,
+ _enc_clip_max[_layer_id * 12 + 8] / _quant_range,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 11], true);
+
+ /* ---step 2. correlation = q * k, perform softmax on correlation--- */
+ CHECK_GPU_ERROR(cublasGemmStridedBatchedEx(
+ _hd, CUBLAS_OP_T, CUBLAS_OP_N, _batch_seq_len, _batch_seq_len,
+ _tw._dim_per_head, &_ione, _p_d_self_k_cache1[_layer_id], CUDA_R_8I,
+ _tw._dim_per_head, _batch_seq_len * _tw._dim_per_head, _int8_ffn_in_buf,
+ CUDA_R_8I, _tw._dim_per_head, _batch_seq_len * _tw._dim_per_head, &_izero,
+ _int32_ffn_out_buf, CUDA_R_32I, _batch_seq_len,
+ _batch_seq_len * _batch_seq_len, _batch_size * _tw._head_num, CUDA_R_32I,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+ ker_correlation_softmax_gpt_i32I_launcher<_DataType>(
+ _batch_size, _batch_seq_len, _tw._head_num, _stream, _int32_ffn_out_buf,
+ _p_d_c, _p_d_real_seq_len, _atten_scaler,
+ _enc_clip_max[_layer_id * 12 + 11] / _quant_range);
+
+ /* ---step 3. new_q = correlation * v--- */
+ CHECK_GPU_ERROR(cublasGemmStridedBatchedEx(
+ _hd, CUBLAS_OP_N, CUBLAS_OP_N, _tw._dim_per_head, _batch_seq_len,
+ _batch_seq_len, &_fone, _p_d_v, _AType, _tw._dim_per_head,
+ _batch_seq_len * _tw._dim_per_head, _p_d_c, _BType, _batch_seq_len,
+ _batch_seq_len * _batch_seq_len, &_fzero, _p_d_q, _CType,
+ _tw._dim_per_head, _batch_seq_len * _tw._dim_per_head,
+ _batch_size * _tw._head_num, _computeType,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+  // use v to save the reshaped q, since they have the same size and v
+  // will not be used again before the next multi-head attention
+ ker_arrange_atten_output_i8O_launcher<_DataType>(
+ _batch_token_num, _tw._hidden_size, _stream, _p_d_q, _int8_ffn_in_buf,
+ _batch_seq_len, _tw._dim_per_head, _tw._head_num, _max_thread_per_block,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 5], false);
+
+ /* ---step 4. new_q = ori_q + new_q * output_wei--- */
+ cublaslt_gemm(
+ _int8_p_d_enc_wei[_layer_id * 4 + 1], _int8_ffn_in_buf, _int8_ffn_out_buf,
+ 1, _tw._hidden_size, _batch_token_num, _tw._hidden_size, 0, 0, 0,
+ _enc_clip_max[_layer_id * 12 + 1] * _enc_clip_max[_layer_id * 12 + 5] /
+ (_enc_clip_max[_layer_id * 12 + 9] * _quant_range),
+ _cublas_lt_handle, _stream);
+
+#ifdef DEBUG_RESULT
+ print_vec(_int8_ffn_in_buf, "attn out in", 20);
+ print_vec(_int8_p_d_enc_wei[_layer_id * 4 + 1], "attn out w", 20);
+ print_vec(_int8_ffn_out_buf, "attn out out", 20);
+#endif
+
+ ker_residual_bias_ln_i8I_i8O_launcher<_DataType>(
+ _int8_ffn_out_buf, _p_device_wei[_weight_offset + 6],
+ _p_device_wei[_weight_offset + 7], _p_device_wei[_weight_offset + 11],
+ _int8_ffn_in_buf, _p_d_query, _batch_token_num, _tw._hidden_size,
+ _enc_clip_max[_layer_id * 12 + 9] / _quant_range,
+ _quant_range / _enc_clip_max[_layer_id * 12 + 6], _max_thread_per_block,
+ _stream, false, false, true);
+
+ return;
+}
+
+template <OperationType OpType_>