add auto igemm for gpt, vit (#408)
godweiyang committed Oct 24, 2022
1 parent 7c03c4d commit b665742
Showing 11 changed files with 202 additions and 86 deletions.
@@ -45,6 +45,7 @@ def extract_gpt_weights(
eos_id=50256,
pad_id=50257,
max_step=50,
extra_decode_length=0,
):
# load var names
with open(os.path.join(os.path.dirname(model_dir), "config.json")) as f:
@@ -121,6 +122,9 @@ def extract_gpt_weights(
hdf5_file.create_dataset("model_conf/topp", data=topp, dtype="f4")
hdf5_file.create_dataset("model_conf/topk", data=topk, dtype="i4")
hdf5_file.create_dataset("model_conf/eos_id", data=eos_id, dtype="i4")
hdf5_file.create_dataset(
"model_conf/extra_decode_length", data=extra_decode_length, dtype="i4"
)

hdf5_file.close()
# read-in again to double check
@@ -150,6 +154,7 @@ def _print_pair(key, value):
eos_id = 50256
pad_id = 50257
max_step = 50
extra_decode_length = 0  # use a positive length to activate it
extract_gpt_weights(
hdf5_path,
args.model,
@@ -159,4 +164,5 @@ def _print_pair(key, value):
eos_id=eos_id,
pad_id=pad_id,
max_step=max_step,
extra_decode_length=extra_decode_length,
)
4 changes: 3 additions & 1 deletion lightseq/csrc/kernels/includes/cublas_algo_map.h
@@ -19,7 +19,9 @@
#define STRIDE 32
#define BORDER 512

static std::string DEFAULT_URL = "https://zenodo.org/record/7219754/files/";
static std::string DEFAULT_URL =
"http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/"
"igemm_configs/";
static std::string DEFAULT_DIR =
std::string(std::getenv("HOME")) + "/.lightseq/igemm_configs/";
static std::string IGEMM_T4_CONFIG = "igemm_T4.cfg";
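Both copies of cublas_algo_map.h (this one and the inference copy below) repoint the config download from Zenodo to ByteDance's CDN. As a minimal sketch of how these constants plausibly fit together — the real download/parse logic lives in the corresponding .cc file and may differ, and resolve_igemm_config is an illustrative name, not LightSeq's API:

#include <cstdlib>
#include <string>

// Sketch: resolve the local path of a tuned igemm config, assuming the
// loader fetches DEFAULT_URL + filename into DEFAULT_DIR on a cache miss.
std::string resolve_igemm_config(const std::string& filename) {
  std::string local =
      std::string(std::getenv("HOME")) + "/.lightseq/igemm_configs/" + filename;
  // if (!file_exists(local)) download(DEFAULT_URL + filename, local);
  return local;
}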
4 changes: 3 additions & 1 deletion lightseq/inference/model/cublas_algo_map.h
@@ -21,7 +21,9 @@ namespace cuda {
#define STRIDE 32
#define BORDER 512

static std::string DEFAULT_URL = "https://zenodo.org/record/7219754/files/";
static std::string DEFAULT_URL =
"http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/"
"igemm_configs/";
static std::string DEFAULT_DIR =
std::string(std::getenv("HOME")) + "/.lightseq/igemm_configs/";
static std::string IGEMM_T4_CONFIG = "igemm_T4.cfg";
9 changes: 4 additions & 5 deletions lightseq/inference/model/gpt_encoder.cc.cu
@@ -248,15 +248,15 @@ int GptEncoder<OpType_>::run_one_sample(int batch_size, int batch_seq_len) {
ker_norm_layer_launcher<_DataType>(
_batch_token_num, _tw._hidden_size, _stream, _p_d_query,
_p_d_src_emb_wei[2], _p_d_src_emb_wei[3], _max_thread_per_block);
if (sample_one_token() == 0 || _batch_seq_len >= _tw._max_step) {
if (sample_one_token() == 0 || _batch_seq_len >= _batch_max_seq_len) {
CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_sample_id_buf, _p_d_sample_id,
_batch_token_num * sizeof(int),
cudaMemcpyDeviceToDevice, _stream));
CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
return _batch_seq_len;
}

while (_batch_seq_len < _tw._max_step) {
while (_batch_seq_len < _batch_max_seq_len) {
#ifdef DEBUG_RESULT
std::cout << "before sample:batch_size-" << _batch_size << " batch_seq_len-"
<< _batch_seq_len << std::endl;
@@ -282,14 +282,13 @@ int GptEncoder<OpType_>::run_one_sample(int batch_size, int batch_seq_len) {
ker_norm_layer_launcher<_DataType>(
_batch_size, _tw._hidden_size, _stream, _p_d_query, _p_d_src_emb_wei[2],
_p_d_src_emb_wei[3], _max_thread_per_block);

#ifdef DEBUG_RESULT
print_vec(_p_d_query, "_p_d_query before logits",
_batch_size * _tw._hidden_size - 10,
_batch_size * _tw._hidden_size);

if (sample_one_token_with_cache() == 0 || _batch_seq_len >= _tw._max_step)
break;
#else

bool unfinish = sample_one_token_with_cache();
if (!unfinish && !_is_benchmark) break;
#endif
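The FP16/FP32 GPT encoder above swaps both of its _tw._max_step bounds for the new per-batch _batch_max_seq_len budget; the pre-existing _is_benchmark guard still lets benchmark runs decode the full budget even after every sequence has finished. A condensed restatement of the loop's stopping logic after this change (a sketch, not a verbatim excerpt):

// Condensed from run_one_sample(): decode until the per-batch budget is
// reached, or until sampling reports all sequences finished (benchmark
// runs ignore early finishes to keep timings comparable).
while (_batch_seq_len < _batch_max_seq_len) {
  // ... one decoder step: embeddings, transformer layers, final LayerNorm ...
  bool unfinished = sample_one_token_with_cache();
  if (!unfinished && !_is_benchmark) break;
}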
133 changes: 91 additions & 42 deletions lightseq/inference/model/quant_gpt_encoder.cc.cu
@@ -47,6 +47,7 @@ QuantGptEncoder<OpType_>::QuantGptEncoder(
_h_sample_id(max_batch_size * tw._max_step, 0),
_h_unfinished(1),
_is_benchmark(false),
_algo_map(),
_sm_gt_eq_80(getSMVersion() >= 80 ? true : false) {
CHECK_GPU_ERROR(cublasLtCreate(&_cublas_lt_handle));
}
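The new _algo_map member is what makes the igemm "auto": it carries per-shape cuBLASLt algorithm selections loaded from the downloaded config file. A hypothetical sketch of the lookup pattern such a map implements — the struct, class, and method names here are illustrative assumptions, not LightSeq's actual API:

#include <map>
#include <tuple>

struct GemmAlgoInfo {
  int algo_id;       // cublasLtMatmulAlgo id profiled offline for this shape
  int tile, stages;  // further tuning knobs stored alongside it
};

class AlgoMap {
  std::map<std::tuple<int, int, int>, GemmAlgoInfo> algos_;  // key: (m, n, k)

 public:
  // Fall back to a heuristic default when a shape was never profiled.
  GemmAlgoInfo find(int m, int n, int k) const {
    auto it = algos_.find(std::make_tuple(m, n, k));
    return it != algos_.end() ? it->second : GemmAlgoInfo{-1, 0, 0};
  }
};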
@@ -179,11 +180,13 @@ void QuantGptEncoder<OpType_>::init_buffer() {
_p_device_wei.push_back(
to_gpu(_p_d_enc_wei[_weight_offset + 11], _tw._hidden_size, _stream));

auto weight_layout = _sm_gt_eq_80 ? kColMajor : kColMajor32;

quantize_weight(_p_d_enc_wei[_weight_offset + 2],
_int8_p_d_enc_wei[_layer_id * 4], _tw._hidden_size,
_tw._hidden_size * 3,
_quant_range / _enc_clip_max[_layer_id * 12], _stream,
_cublas_lt_handle);
_cublas_lt_handle, weight_layout);

quantize_weight(_p_d_enc_wei[_weight_offset + 4],
_int8_p_d_enc_wei[_layer_id * 4 + 1], _tw._hidden_size,
@@ -195,7 +198,7 @@
_int8_p_d_enc_wei[_layer_id * 4 + 2], _tw._hidden_size,
_tw._inner_size,
_quant_range / _enc_clip_max[_layer_id * 12 + 2], _stream,
_cublas_lt_handle);
_cublas_lt_handle, weight_layout);

quantize_weight(_p_d_enc_wei[_weight_offset + 10],
_int8_p_d_enc_wei[_layer_id * 4 + 3], _tw._inner_size,
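The weight_layout switch above is the crux of the new Ampere path: pre-Ampere int8 tensor-core GEMMs in cuBLASLt consume COL32-ordered operands (hence kColMajor32), while SM >= 80 can take plain column-major int8 weights. A minimal sketch of what that choice means at the cuBLASLt level, assuming kColMajor and kColMajor32 map to these orderings (error handling omitted):

#include <cublasLt.h>

// Create an int8 matrix layout matching the weight_layout choice above.
cublasLtMatrixLayout_t make_int8_layout(int64_t rows, int64_t cols,
                                        bool sm_ge_80) {
  cublasLtOrder_t order =
      sm_ge_80 ? CUBLASLT_ORDER_COL : CUBLASLT_ORDER_COL32;
  // COL32 packs 32 columns per tile, so its leading dimension is 32 * rows.
  int64_t ld = sm_ge_80 ? rows : 32 * rows;
  cublasLtMatrixLayout_t layout;
  cublasLtMatrixLayoutCreate(&layout, CUDA_R_8I, rows, cols, ld);
  cublasLtMatrixLayoutSetAttribute(layout, CUBLASLT_MATRIX_LAYOUT_ORDER,
                                   &order, sizeof(order));
  return layout;
}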
@@ -306,6 +309,8 @@ int QuantGptEncoder<OpType_>::run_one_sample(int batch_size,
_batch_size = batch_size;
_batch_seq_len = batch_seq_len;
_batch_token_num = batch_size * batch_seq_len;
_batch_max_seq_len =
min(_tw._max_step, batch_seq_len + _tw._extra_decode_length);

CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_real_seq_len, _h_real_seq_len.data(),
sizeof(int) * _batch_size,
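A worked example of the cap computed above: with max_step = 50, a 20-token prompt, and extra_decode_length = 10, sampling now stops at min(50, 20 + 10) = 30 total tokens instead of always running out to max_step. The exporter's default of 0 is described as disabling the feature ("use a positive length to activate it"), so the zero case is presumably special-cased where _tw._extra_decode_length is loaded:

#include <algorithm>

// The decode budget introduced in this commit, for a positive
// extra_decode_length; mirrors the min() above.
int decode_budget(int max_step, int prompt_len, int extra_decode_length) {
  // e.g. decode_budget(50, 20, 10) == 30
  return std::min(max_step, prompt_len + extra_decode_length);
}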
@@ -345,15 +350,15 @@
_p_d_self_v_cache2 = _p_d_self_v_cache1;
_p_d_self_v_cache1 = ftmp;

if (sample_one_token() == 0 || _batch_seq_len >= _tw._max_step) {
if (sample_one_token() == 0 || _batch_seq_len >= _batch_max_seq_len) {
CHECK_GPU_ERROR(cudaMemcpyAsync(_p_d_sample_id_buf, _p_d_sample_id,
_batch_token_num * sizeof(int),
cudaMemcpyDeviceToDevice, _stream));
CHECK_GPU_ERROR(cudaStreamSynchronize(_stream));
return _batch_seq_len;
}

while (_batch_seq_len < _tw._max_step) {
while (_batch_seq_len < _batch_max_seq_len) {
#ifdef DEBUG_RESULT
std::cout << "before sample:batch_size-" << _batch_size << " batch_seq_len-"
<< _batch_seq_len << std::endl;
@@ -485,16 +490,25 @@ void QuantGptEncoder<OpType_>::self_attention() {
_int8_ffn_in_buf, _p_device_wei[_weight_offset],
_p_device_wei[_weight_offset + 1], _p_device_wei[_weight_offset + 5],
_max_thread_per_block, _quant_range / _enc_clip_max[_layer_id * 12 + 4],
false, true);
false, !_sm_gt_eq_80);
}

cublasLtMM_withAlgo_i8IO(
_int8_ffn_out_buf, 1, _batch_token_num, _tw._hidden_size * 3,
_tw._hidden_size, 0, 0, 0,
_enc_clip_max[_layer_id * 12] * _enc_clip_max[_layer_id * 12 + 4] /
(_enc_clip_max[_layer_id * 12 + 8] * _quant_range),
_int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4], _cublas_lt_handle,
_stream, _sm_gt_eq_80);
if (_sm_gt_eq_80) {
cublaslt_gemm(
_int8_p_d_enc_wei[_layer_id * 4], _int8_ffn_in_buf, _int8_ffn_out_buf,
1, _tw._hidden_size * 3, _batch_token_num, _tw._hidden_size, 0, 0, 0,
_enc_clip_max[_layer_id * 12] * _enc_clip_max[_layer_id * 12 + 4] /
(_enc_clip_max[_layer_id * 12 + 8] * _quant_range),
_cublas_lt_handle, _stream, _algo_map);
} else {
cublasLtMM_withAlgo_i8IO(
_int8_ffn_out_buf, 1, _batch_token_num, _tw._hidden_size * 3,
_tw._hidden_size, 0, 0, 0,
_enc_clip_max[_layer_id * 12] * _enc_clip_max[_layer_id * 12 + 4] /
(_enc_clip_max[_layer_id * 12 + 8] * _quant_range),
_int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4], _cublas_lt_handle,
_stream, _sm_gt_eq_80);
}

#ifdef DEBUG_RESULT
print_vec(_int8_ffn_in_buf, "attn qkv in", 20);
@@ -509,7 +523,7 @@ void QuantGptEncoder<OpType_>::self_attention() {
_p_d_self_k_cache1[_layer_id], _p_d_self_v_cache1[_layer_id], _p_d_v,
_batch_seq_len, _tw._dim_per_head, _tw._head_num, _max_thread_per_block,
_enc_clip_max[_layer_id * 12 + 8] / _quant_range,
_quant_range / _enc_clip_max[_layer_id * 12 + 11], true);
_quant_range / _enc_clip_max[_layer_id * 12 + 11], !_sm_gt_eq_80);

/* ---step 2. correlation = q * k, perform softmax on correlation--- */
CHECK_GPU_ERROR(cublasGemmStridedBatchedEx(
@@ -563,7 +577,7 @@ void QuantGptEncoder<OpType_>::self_attention() {
_int8_ffn_in_buf, _p_d_query, _batch_token_num, _tw._hidden_size,
_enc_clip_max[_layer_id * 12 + 9] / _quant_range,
_quant_range / _enc_clip_max[_layer_id * 12 + 6], _max_thread_per_block,
_stream, false, false, true);
_stream, false, false, !_sm_gt_eq_80);

return;
}
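The block above shows the dispatch pattern repeated throughout this file: SM >= 80 routes the int8 GEMM through cublaslt_gemm with the profiled _algo_map (note the swapped output/dimension arguments relative to the COL32 call), while older GPUs keep the COL32 cublasLtMM_withAlgo_i8IO path. Both receive the same fused requantization scale; written out as a sketch, with descriptive names standing in for the _enc_clip_max entries:

// The fused int8 GEMM output scale used for the QKV projection above.
// Inputs were quantized as x_q = x * quant_range / clip_in, weights as
// w_q = w * quant_range / clip_w, and the int8 output must satisfy
// y_q = y * quant_range / clip_out; folding the three factors gives
//   (clip_in / qr) * (clip_w / qr) * (qr / clip_out).
float fused_requant_scale(float clip_in, float clip_w, float clip_out,
                          float quant_range) {
  return clip_in * clip_w / (clip_out * quant_range);
}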
@@ -576,18 +590,27 @@ void QuantGptEncoder<OpType_>::self_attention_with_cache() {
_batch_size, _tw._hidden_size, _stream, _p_d_query, _int8_ffn_in_buf,
_p_device_wei[_weight_offset], _p_device_wei[_weight_offset + 1],
_p_device_wei[_weight_offset + 5], _max_thread_per_block,
_quant_range / _enc_clip_max[_layer_id * 12 + 4], false, true);
_quant_range / _enc_clip_max[_layer_id * 12 + 4], false, !_sm_gt_eq_80);
}

/* ---step 1. qkv = ori_q * qkv_wei + bias, and reshape qkv for multi-head
* gemm--- */
cublasLtMM_withAlgo_i8IO(
_int8_ffn_out_buf, 1, _batch_size, _tw._hidden_size * 3, _tw._hidden_size,
0, 0, 0,
_enc_clip_max[_layer_id * 12] * _enc_clip_max[_layer_id * 12 + 4] /
(_enc_clip_max[_layer_id * 12 + 8] * _quant_range),
_int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4], _cublas_lt_handle,
_stream, _sm_gt_eq_80);
if (_sm_gt_eq_80) {
cublaslt_gemm(
_int8_p_d_enc_wei[_layer_id * 4], _int8_ffn_in_buf, _int8_ffn_out_buf,
1, _tw._hidden_size * 3, _batch_size, _tw._hidden_size, 0, 0, 0,
_enc_clip_max[_layer_id * 12] * _enc_clip_max[_layer_id * 12 + 4] /
(_enc_clip_max[_layer_id * 12 + 8] * _quant_range),
_cublas_lt_handle, _stream, _algo_map);
} else {
cublasLtMM_withAlgo_i8IO(
_int8_ffn_out_buf, 1, _batch_size, _tw._hidden_size * 3,
_tw._hidden_size, 0, 0, 0,
_enc_clip_max[_layer_id * 12] * _enc_clip_max[_layer_id * 12 + 4] /
(_enc_clip_max[_layer_id * 12 + 8] * _quant_range),
_int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4], _cublas_lt_handle,
_stream, _sm_gt_eq_80);
}

// get q, k, v by split and reshape qkv
ker_arrange_qkv_with_cache_i8I_i8O_launcher<_DataType>(
Expand All @@ -597,7 +620,7 @@ void QuantGptEncoder<OpType_>::self_attention_with_cache() {
_p_d_self_v_cache1[_layer_id], _p_d_self_v_cache2[_layer_id],
_batch_seq_len, _tw._dim_per_head, _tw._head_num,
_enc_clip_max[_layer_id * 12 + 8] / _quant_range,
_quant_range / _enc_clip_max[_layer_id * 12 + 11], true);
_quant_range / _enc_clip_max[_layer_id * 12 + 11], !_sm_gt_eq_80);

/* ---step 2. correlation = q * k, perform softmax on correlation
correlation: [batch_size, heads_num, 1, batch_seq_len]--- */
@@ -630,20 +653,30 @@ void QuantGptEncoder<OpType_>::self_attention_with_cache() {
_int8_ffn_in_buf, _p_d_query, _batch_size, _tw._hidden_size,
_enc_clip_max[_layer_id * 12 + 9] / _quant_range,
_quant_range / _enc_clip_max[_layer_id * 12 + 6], _max_thread_per_block,
_stream, false, false, true);
_stream, false, false, !_sm_gt_eq_80);
return;
}

template <OperationType OpType_>
void QuantGptEncoder<OpType_>::ffn_add_norm() {
/* ---step 1. first ffn layer--- */
cublasLtMM_withAlgo_i8IO(
_int8_ffn_out_buf, 1, _batch_token_num, _tw._inner_size, _tw._hidden_size,
0, 0, 0,
_enc_clip_max[_layer_id * 12 + 2] * _enc_clip_max[_layer_id * 12 + 6] /
(_enc_clip_max[_layer_id * 12 + 10] * _quant_range),
_int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4 + 2], _cublas_lt_handle,
_stream, _sm_gt_eq_80);
if (_sm_gt_eq_80) {
cublaslt_gemm(_int8_p_d_enc_wei[_layer_id * 4 + 2], _int8_ffn_in_buf,
_int8_ffn_out_buf, 1, _tw._inner_size, _batch_token_num,
_tw._hidden_size, 0, 0, 0,
_enc_clip_max[_layer_id * 12 + 2] *
_enc_clip_max[_layer_id * 12 + 6] /
(_enc_clip_max[_layer_id * 12 + 10] * _quant_range),
_cublas_lt_handle, _stream, _algo_map);
} else {
cublasLtMM_withAlgo_i8IO(
_int8_ffn_out_buf, 1, _batch_token_num, _tw._inner_size,
_tw._hidden_size, 0, 0, 0,
_enc_clip_max[_layer_id * 12 + 2] * _enc_clip_max[_layer_id * 12 + 6] /
(_enc_clip_max[_layer_id * 12 + 10] * _quant_range),
_int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4 + 2],
_cublas_lt_handle, _stream, _sm_gt_eq_80);
}

#ifdef DEBUG_RESULT
print_vec(_int8_ffn_in_buf, "ffn1 in", 20);
@@ -655,7 +688,7 @@ void QuantGptEncoder<OpType_>::ffn_add_norm() {
_batch_token_num, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
_p_device_wei[_weight_offset + 9], _tw._inner_size,
_enc_clip_max[_layer_id * 12 + 10] / _quant_range,
_quant_range / _enc_clip_max[_layer_id * 12 + 7], true, false);
_quant_range / _enc_clip_max[_layer_id * 12 + 7], !_sm_gt_eq_80, false);

/* ---step 2. second ffn layer--- */
cublaslt_gemm(_int8_p_d_enc_wei[_layer_id * 4 + 3], _int8_ffn_in_buf,
Expand All @@ -670,6 +703,7 @@ void QuantGptEncoder<OpType_>::ffn_add_norm() {

const _DataType *scale_ptr, *bias_ptr, *res_bias_ptr;
float clip_max, dequant_scale;
bool use_col32;
dequant_scale = _enc_clip_max[_layer_id * 12 + 3] *
_enc_clip_max[_layer_id * 12 + 7] /
(_quant_range * _quant_range);
@@ -678,39 +712,51 @@ void QuantGptEncoder<OpType_>::ffn_add_norm() {
bias_ptr = _p_device_emb[3];
res_bias_ptr = nullptr;
clip_max = _output_ln_clip_max;
use_col32 = true;
} else {
scale_ptr = _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer];
bias_ptr = _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer + 1];
res_bias_ptr =
_p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer + 5];
clip_max = _enc_clip_max[(_layer_id + 1) * 12 + 4];
use_col32 = !_sm_gt_eq_80;
}

ker_residual_bias_ln_i32I_i8O_launcher<_DataType>(
_int32_ffn_out_buf, scale_ptr, bias_ptr, res_bias_ptr, _int8_ffn_in_buf,
_p_d_query, _batch_token_num, _tw._hidden_size, dequant_scale,
_quant_range / clip_max, _max_thread_per_block, _stream, false, false,
true, _scaled_ffn2_colsum[_layer_id]);
use_col32, _scaled_ffn2_colsum[_layer_id]);

return;
}
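The new use_col32 flag replaces a hard-coded true in the residual LayerNorm launcher: the branch handling the final output LayerNorm (the one reading _p_device_emb, its condition hidden in the collapsed lines above) keeps COL32, presumably because its consumer still expects that layout, while intermediate layers now follow the per-GPU GEMM path. A hypothetical condensed restatement, with is_last_layer standing in for the elided condition:

// Condensed view of the flag set above; not new logic.
bool use_col32 = is_last_layer ? true : !_sm_gt_eq_80;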

template <OperationType OpType_>
void QuantGptEncoder<OpType_>::ffn_add_norm_with_cache() {
/* ---step 1. first ffn layer--- */
cublasLtMM_withAlgo_i8IO(
_int8_ffn_out_buf, 1, _batch_size, _tw._inner_size, _tw._hidden_size, 0,
0, 0,
_enc_clip_max[_layer_id * 12 + 2] * _enc_clip_max[_layer_id * 12 + 6] /
(_enc_clip_max[_layer_id * 12 + 10] * _quant_range),
_int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4 + 2], _cublas_lt_handle,
_stream, _sm_gt_eq_80);
if (_sm_gt_eq_80) {
cublaslt_gemm(_int8_p_d_enc_wei[_layer_id * 4 + 2], _int8_ffn_in_buf,
_int8_ffn_out_buf, 1, _tw._inner_size, _batch_size,
_tw._hidden_size, 0, 0, 0,
_enc_clip_max[_layer_id * 12 + 2] *
_enc_clip_max[_layer_id * 12 + 6] /
(_enc_clip_max[_layer_id * 12 + 10] * _quant_range),
_cublas_lt_handle, _stream, _algo_map);
} else {
cublasLtMM_withAlgo_i8IO(
_int8_ffn_out_buf, 1, _batch_size, _tw._inner_size, _tw._hidden_size, 0,
0, 0,
_enc_clip_max[_layer_id * 12 + 2] * _enc_clip_max[_layer_id * 12 + 6] /
(_enc_clip_max[_layer_id * 12 + 10] * _quant_range),
_int8_ffn_in_buf, _int8_p_d_enc_wei[_layer_id * 4 + 2],
_cublas_lt_handle, _stream, _sm_gt_eq_80);
}

ker_bias_gelu_i8I_i8O_launcher<_DataType>(
_batch_size, _stream, _int8_ffn_out_buf, _int8_ffn_in_buf,
_p_device_wei[_weight_offset + 9], _tw._inner_size,
_enc_clip_max[_layer_id * 12 + 10] / _quant_range,
_quant_range / _enc_clip_max[_layer_id * 12 + 7], true, false);
_quant_range / _enc_clip_max[_layer_id * 12 + 7], !_sm_gt_eq_80, false);

/* ---step 2. second ffn layer--- */
cublaslt_gemm(_int8_p_d_enc_wei[_layer_id * 4 + 3], _int8_ffn_in_buf,
@@ -719,6 +765,7 @@ void QuantGptEncoder<OpType_>::ffn_add_norm_with_cache() {

const _DataType *scale_ptr, *bias_ptr, *res_bias_ptr;
float clip_max, dequant_scale;
bool use_col32;
dequant_scale = _enc_clip_max[_layer_id * 12 + 3] *
_enc_clip_max[_layer_id * 12 + 7] /
(_quant_range * _quant_range);
@@ -727,19 +774,21 @@ void QuantGptEncoder<OpType_>::ffn_add_norm_with_cache() {
bias_ptr = _p_device_emb[3];
res_bias_ptr = nullptr;
clip_max = _output_ln_clip_max;
use_col32 = true;
} else {
scale_ptr = _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer];
bias_ptr = _p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer + 1];
res_bias_ptr =
_p_device_wei[(_layer_id + 1) * _tw._weight_per_enc_layer + 5];
clip_max = _enc_clip_max[(_layer_id + 1) * 12 + 4];
use_col32 = !_sm_gt_eq_80;
}

ker_residual_bias_ln_i32I_i8O_launcher<_DataType>(
_int32_ffn_out_buf, scale_ptr, bias_ptr, res_bias_ptr, _int8_ffn_in_buf,
_p_d_query, _batch_size, _tw._hidden_size, dequant_scale,
_quant_range / clip_max, _max_thread_per_block, _stream, false, false,
true, _scaled_ffn2_colsum[_layer_id]);
use_col32, _scaled_ffn2_colsum[_layer_id]);

return;
}
