[WIP] FLAN-T5 integration #194

Status: Open. Wants to merge 72 commits into base: baseline_commit.

Commits (72):
dd82ba3  t5-small (Feb 18, 2024)
f2fd579  fix (js8544, Feb 29, 2024)
2fb6905  lint (js8544, Feb 29, 2024)
be58c3b  T5 enc/dec example file; linting/formatting (afeldman-nm, Mar 1, 2024)
70837fd  native/vllm t5 comparison test (afeldman-nm, Mar 1, 2024)
42a6e2b  merged upstream-main into enc_dec_t5 (afeldman-nm, Mar 1, 2024)
e3fd30d  Merge branch 'upstream-main' into enc_dec_t5 (afeldman-nm, Mar 2, 2024)
db726e6  Merge pull request #1 from afeldman-nm/enc_dec_t5 (js8544, Mar 2, 2024)
43e920e  remove debug print statements (afeldman-nm, Mar 2, 2024)
431f014  silence warning; legacy=False for tokenizer; lint/format (afeldman-nm, Mar 2, 2024)
37fcf99  Merge branch 'js8544_enc_dec_t5' into enc_dec_t5 (afeldman-nm, Mar 2, 2024)
4bf056b  Merge pull request #2 from afeldman-nm/enc_dec_t5 (js8544, Mar 2, 2024)
8a5060f  fix _make_tensor_with_pad args change which broke decoder scenario (afeldman-nm, Mar 5, 2024)
29d6f44  fixed bug caused by non-handling of self.model_config is None in mode… (afeldman-nm, Mar 5, 2024)
a4950ba  remove commented-out print statements (afeldman-nm, Mar 5, 2024)
9c03760  small cleanup (afeldman-nm, Mar 5, 2024)
9f20ccf  Merge pull request #3 from afeldman-nm/enc_dec_t5 (js8544, Mar 6, 2024)
6d6dccd  arg naming fix (afeldman-nm, Mar 7, 2024)
7035178  Merge branch 'js8544_enc_dec_t5' into enc_dec_t5 (afeldman-nm, Mar 12, 2024)
dbec357  fixed attention_kernels.cu merge conflict; questions about ROCM (afeldman-nm, Mar 12, 2024)
4b2a121  llm_engine.py conflict resolution; removed prefix caching code; Seque… (afeldman-nm, Mar 12, 2024)
a93c17d  actually updated Sequence constructor to take i_encoder_decoder, eos_… (afeldman-nm, Mar 12, 2024)
a62c3af  xformers.py accept incoming changes; replace paged_attention function… (afeldman-nm, Mar 12, 2024)
c31921f  saved changed to xformers woops (afeldman-nm, Mar 12, 2024)
0c78be9  attempt at fixing model_runner conflicts related to encoder/decoder &… (afeldman-nm, Mar 12, 2024)
e25e6b8  encoder/decoder + prefix caching not supported; moved check from llm.… (afeldman-nm, Mar 12, 2024)
7f70d76  refactoring, including: moved enc_dec_attention.py into vllm/model_ex… (afeldman-nm, Mar 12, 2024)
36c8291  existing regressions pass (yay) but encoder/decoder example fails (afeldman-nm, Mar 12, 2024)
08f268a  fixed encoder/decoder reshape and cache bug, but paged attention call… (afeldman-nm, Mar 12, 2024)
b9b0600  augmented paged attention with context_lens, max_context_len, block_t… (afeldman-nm, Mar 12, 2024)
63e9dca  linting/formatting fixes (afeldman-nm, Mar 12, 2024)
4d7e5a8  Merge branch 'upstream-main' into enc_dec_t5_merge_upstream2 (afeldman-nm, Mar 12, 2024)
bb7a219  Merge branch 'upstream-main' into enc_dec_t5_merge_upstream2 (afeldman-nm, Mar 16, 2024)
0b60121  fixed bug introduced during formatting (afeldman-nm, Mar 16, 2024)
d44257e  fixed example (afeldman-nm, Mar 16, 2024)
19c5c4b  Merge branch 'enc_dec_t5' into enc_dec_t5_merge_upstream2 (afeldman-nm, Mar 16, 2024)
c2f97b6  merged upstream (afeldman-nm, Mar 19, 2024)
0536ff5  rolled back some encoder/decoder changes (afeldman-nm, Mar 19, 2024)
7d4972c  merged in upstream-main (afeldman-nm, Mar 22, 2024)
23a5da5  added cross_block_tables to SequenceGroupMetadata (afeldman-nm, Mar 22, 2024)
e32fb9c  SequenceGroupMetadata: added cross_seq_data; optional along with cros… (afeldman-nm, Mar 22, 2024)
ae1c368  added block manager allocation of cross sequence block_tables (afeldman-nm, Mar 22, 2024)
691c2c1  scheduler schedule() support cross block-tables and cross sequences, … (afeldman-nm, Mar 22, 2024)
e240eb4  LLMEngine can build a sequencegroup with cross sequences (afeldman-nm, Mar 22, 2024)
cbfba8e  t5 Sampler does not pass vocab size to constructor; input_metadata.pr… (afeldman-nm, Mar 22, 2024)
501551c  add_request now correctly swaps decoder_prompt, prompt in encoder/de… (afeldman-nm, Mar 22, 2024)
08435e4  Added cross_input_metadata field to InputMetadata (afeldman-nm, Mar 22, 2024)
6e459a2  wip multi blocktable (afeldman-nm, Mar 25, 2024)
8e1ca33  wip (afeldman-nm, Mar 25, 2024)
e097732  plumbing dummy input metadata structures into model (afeldman-nm, Mar 25, 2024)
2a44585  plumbed encoder/decoder input metadata all the way into t5 (afeldman-nm, Mar 25, 2024)
91a4608  first pass at T5 encoder support (afeldman-nm, Mar 26, 2024)
d0c5e36  inefficient but effective & Attention-wrapper-compatible implementati… (afeldman-nm, Mar 27, 2024)
3737d5b  wip cross-attention (afeldman-nm, Mar 28, 2024)
38946ed  first pass at enc/dec support that runs e2e but doesn't produce corre… (afeldman-nm, Apr 1, 2024)
3c39f55  to pass regression tests: removed debug prints (afeldman-nm, Apr 1, 2024)
4ec2fde  wip vllm, examples => fp32 (afeldman-nm, Apr 1, 2024)
38f55ed  works on bsz = 1 (afeldman-nm, Apr 1, 2024)
1aedc80  intermediate activations for prompt_run look right! Decoded token loo… (afeldman-nm, Apr 1, 2024)
c1258b4  wip (afeldman-nm, Apr 2, 2024)
0af1022  passing with t5-small (afeldman-nm, Apr 3, 2024)
9e8d234  vLLM T5 matches native; fixes: decode-phase cross-input-met… (afeldman-nm, Apr 4, 2024)
f5242a0  refactoring out print statements (afeldman-nm, Apr 4, 2024)
de0fd31  fix to pass regression tests (afeldman-nm, Apr 4, 2024)
5a67647  WIP google/flan-t5-xxxx (afeldman-nm, Apr 4, 2024)
ed05d47  removed print statement (afeldman-nm, Apr 4, 2024)
d5a8b92  batched enc/dec example (afeldman-nm, Apr 10, 2024)
f555f5d  wip, trying prompt padding (afeldman-nm, Apr 12, 2024)
2c12b44  bs >1 prefill works (afeldman-nm, Apr 17, 2024)
dba02b2  small change to examples (afeldman-nm, Apr 17, 2024)
db201b6  fix to support case where num prompts != 2 (afeldman-nm, Apr 17, 2024)
ead7c82  set up (failing) flan-t5 test (afeldman-nm, Apr 17, 2024)
4 changes: 4 additions & 0 deletions .clang-format
@@ -0,0 +1,4 @@
# Use the Google style in this project.
BasedOnStyle: Google

ColumnLimit: 120
7 changes: 4 additions & 3 deletions benchmarks/backend_request_func.py
@@ -110,7 +110,8 @@ async def async_request_vllm(
output.ttft = ttft
output.latency = time.perf_counter() - st

# When streaming, '\0' is appended to the end of response.
# When streaming, '\0' is appended
# to the end of the response.
body = data.decode("utf-8").strip("\0")
output.generated_text = json.loads(
body)["text"][0][len(request_func_input.prompt):]
@@ -192,8 +193,8 @@ async def async_request_deepspeed_mii(
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len

# DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# will use 0 as placeholder.
# DeepSpeed-MII doesn't support streaming
# as of Jan 28 2024, will use 0 as placeholder.
# https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0

8 changes: 4 additions & 4 deletions benchmarks/benchmark_serving.py
@@ -293,9 +293,8 @@ def main(args: argparse.Namespace):

# Save to file
base_model_id = model_id.split("/")[-1]
file_name = (
f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
)
file_name = f"{backend}-{args.request_rate}qps-" \
f"{base_model_id}-{current_dt}.json"
with open(file_name, "w") as outfile:
json.dump(result_json, outfile)

@@ -343,7 +342,8 @@ def main(args: argparse.Namespace):
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default tokenizer.",
"Name or path of the tokenizer, if not " \
"using the default model tokenizer.",
)
parser.add_argument(
"--best-of",
16 changes: 12 additions & 4 deletions benchmarks/benchmark_throughput.py
@@ -6,8 +6,8 @@
from typing import List, Optional, Tuple

import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from transformers import (AutoModelForCausalLM, T5ForConditionalGeneration,
AutoTokenizer, PreTrainedTokenizerBase)
from tqdm import tqdm


@@ -125,8 +125,16 @@ def run_hf(
trust_remote_code: bool,
) -> float:
assert not use_beam_search
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
if "t5" in model:
llm = T5ForConditionalGeneration.from_pretrained(
model,
torch_dtype=torch.float16,
trust_remote_code=trust_remote_code)
else:
llm = AutoModelForCausalLM.from_pretrained(
model,
torch_dtype=torch.float16,
trust_remote_code=trust_remote_code)
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
58 changes: 40 additions & 18 deletions csrc/attention/attention_kernels.cu
@@ -1,5 +1,6 @@
/*
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Adapted from
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -16,9 +17,10 @@
* limitations under the License.
*/

#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <stdio.h>
#include <torch/extension.h>

#include "attention_dtypes.h"
#include "attention_utils.cuh"
@@ -30,12 +32,12 @@

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define DIVIDE_ROUND_UP(a, b) (((a) + (b)-1) / (b))

namespace vllm {

// Utility function for attention softmax.
template<int NUM_WARPS>
template <int NUM_WARPS>
inline __device__ float block_sum(float* red_smem, float sum) {
// Decompose the thread index into warp / lane.
int warp = threadIdx.x / WARP_SIZE;
@@ -93,6 +95,7 @@ __device__ void paged_attention_kernel(
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const float* __restrict__ custom_bias, // [num_seqs, num_heads, 1, max_seq_len]
const int q_stride,
const int kv_block_stride,
const int kv_head_stride) {
@@ -133,6 +136,10 @@
const int num_queries_per_kv = num_heads / num_kv_heads;
const int kv_head_idx = head_idx / num_queries_per_kv;
const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
const float* custom_bias_vec = custom_bias == nullptr
? nullptr
: custom_bias + seq_idx * num_kv_heads * num_context_blocks * BLOCK_SIZE +
kv_head_idx * num_context_blocks * BLOCK_SIZE;

// A vector type to store a part of a key or a query.
// The vector size is configured in such a way that the threads in a thread group
@@ -224,8 +231,10 @@
// Compute dot product.
// This includes a reduction across the threads in the same thread group.
float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
// Add the ALiBi bias if slopes are given.
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0;
// Add the custom or ALiBi bias if given.
qk += (custom_bias_vec != nullptr) ? custom_bias_vec[token_idx]
: (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1)
: 0;

if (thread_group_offset == 0) {
// Store the partial reductions to shared memory.
@@ -435,13 +444,14 @@ __global__ void paged_attention_v1_kernel(
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const float* __restrict__ custom_bias, // [num_seqs, num_heads, 1, seq_len]
const int q_stride,
const int kv_block_stride,
const int kv_head_stride) {
paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_E5M2_KV_CACHE>(
/* exp_sums */ nullptr, /* max_logits */ nullptr,
out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens,
max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride);
max_num_blocks_per_seq, alibi_slopes, custom_bias, q_stride, kv_block_stride, kv_head_stride);
}

// Grid: (num_heads, num_seqs, max_num_partitions).
@@ -466,13 +476,14 @@ __global__ void paged_attention_v2_kernel(
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const float* __restrict__ custom_bias, // [num_seqs, num_heads, 1, seq_len]
const int q_stride,
const int kv_block_stride,
const int kv_head_stride) {
paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_E5M2_KV_CACHE, PARTITION_SIZE>(
exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
q_stride, kv_block_stride, kv_head_stride);
custom_bias, q_stride, kv_block_stride, kv_head_stride);
}

// Grid: (num_heads, num_seqs).
@@ -592,6 +603,7 @@ __global__ void paged_attention_v2_reduce_kernel(
context_lens_ptr, \
max_num_blocks_per_seq, \
alibi_slopes_ptr, \
custom_bias_ptr, \
q_stride, \
kv_block_stride, \
kv_head_stride);
@@ -613,7 +625,8 @@
torch::Tensor& block_tables,
torch::Tensor& context_lens,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes) {
const c10::optional<torch::Tensor>& alibi_slopes,
const c10::optional<torch::Tensor>& custom_bias) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -626,9 +639,11 @@
assert(head_size % thread_group_size == 0);

// NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr = alibi_slopes ?
reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
: nullptr;
const float* alibi_slopes_ptr =
alibi_slopes ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr()) : nullptr;

// NOTE: custom_bias is optional.
const float* custom_bias_ptr = custom_bias ? reinterpret_cast<const float*>(custom_bias.value().data_ptr()) : nullptr;

T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
@@ -688,7 +703,8 @@ void paged_attention_v1_launcher(
block_tables, \
context_lens, \
max_context_len, \
alibi_slopes);
alibi_slopes, \
custom_bias);

// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
@@ -720,6 +736,7 @@ void paged_attention_v1(
int block_size,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const c10::optional<torch::Tensor>& custom_bias,
const std::string& kv_cache_dtype) {
if (kv_cache_dtype == "auto") {
if (query.dtype() == at::ScalarType::Float) {
@@ -762,6 +779,7 @@ void paged_attention_v1(
context_lens_ptr, \
max_num_blocks_per_seq, \
alibi_slopes_ptr, \
custom_bias_ptr, \
q_stride, \
kv_block_stride, \
kv_head_stride); \
@@ -794,7 +812,8 @@ void paged_attention_v2_launcher(
torch::Tensor& block_tables,
torch::Tensor& context_lens,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes) {
const c10::optional<torch::Tensor>& alibi_slopes,
const c10::optional<torch::Tensor>& custom_bias) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -807,9 +826,10 @@
assert(head_size % thread_group_size == 0);

// NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr = alibi_slopes ?
reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
: nullptr;
const float* alibi_slopes_ptr =
alibi_slopes ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr()) : nullptr;

const float* custom_bias_ptr = custom_bias ? reinterpret_cast<const float*>(custom_bias.value().data_ptr()) : nullptr;

T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
@@ -878,7 +898,8 @@ void paged_attention_v2_launcher(
block_tables, \
context_lens, \
max_context_len, \
alibi_slopes);
alibi_slopes, \
custom_bias);

// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
@@ -913,6 +934,7 @@ void paged_attention_v2(
int block_size,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const c10::optional<torch::Tensor>& custom_bias,
const std::string& kv_cache_dtype) {
if (kv_cache_dtype == "auto") {
if (query.dtype() == at::ScalarType::Float) {
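
Note on the kernel change above: paged_attention_v1/v2 gain an optional custom_bias tensor, documented in the signature comments as [num_seqs, num_heads, 1, max_seq_len] and added to the query-key logits in place of the ALiBi bias when present. Below is a minimal PyTorch sketch, in plain Python, of packing per-sequence, per-head bias rows (for example, the decode-step row of a T5 relative-position bias) into that layout; the helper name pack_decode_bias and the zero padding are illustrative assumptions, not code from this PR.

import torch

def pack_decode_bias(per_seq_bias, num_heads, max_seq_len):
    # per_seq_bias: one [num_heads, context_len] tensor per sequence, e.g. the
    # decode-step row of a T5 relative-position bias (assumption for illustration).
    # Packs them into the [num_seqs, num_heads, 1, max_seq_len] layout that the
    # kernel comment documents; unused tail positions are left at zero.
    num_seqs = len(per_seq_bias)
    packed = torch.zeros(num_seqs, num_heads, 1, max_seq_len, dtype=torch.float32)
    for i, bias in enumerate(per_seq_bias):
        ctx_len = bias.shape[-1]
        packed[i, :, 0, :ctx_len] = bias
    return packed

# Example: two sequences with context lengths 5 and 9, 8 heads, padded to 16 positions.
packed = pack_decode_bias([torch.randn(8, 5), torch.randn(8, 9)],
                          num_heads=8, max_seq_len=16)
print(packed.shape)  # torch.Size([2, 8, 1, 16])
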
2 changes: 2 additions & 0 deletions csrc/ops.h
@@ -14,6 +14,7 @@ void paged_attention_v1(
int block_size,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const c10::optional<torch::Tensor>& custom_bias,
const std::string& kv_cache_dtype);

void paged_attention_v2(
@@ -31,6 +32,7 @@ void paged_attention_v2(
int block_size,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const c10::optional<torch::Tensor>& custom_bias,
const std::string& kv_cache_dtype);

void rms_norm(
94 changes: 94 additions & 0 deletions examples/offline_inference_enc_dec.py
@@ -0,0 +1,94 @@
'''
Affirm T5 model outputs match between vLLM and native PyTorch

Scenarios:
* t5-small, t5-large
* float16, float32, bfloat16, bfloat32
* Custom prompts & num. prompts

Output: for several prompts, compare native PyTorch & vLLM prompt completions
'''
import warnings
import torch
from vllm import LLM, SamplingParams
from transformers import T5Tokenizer, T5ForConditionalGeneration

warnings.filterwarnings("ignore",
category=UserWarning,
module="transformers.generation.utils.*")

hf_model_id = "google/flan-t5-small" # t5-small
dtype = "float32"
prompts = [
#"Who are you?",
#"Who are you?",
#"How do",
#"Who aren't you?",
#"Who aren't you?<pad><pad><pad><pad>", #
"Who are you? Write a very long response.",
]

dtype_obj = getattr(torch, dtype)

# Native PyTorch test

# - Model and tokenizer initialization
tokenizer = T5Tokenizer.from_pretrained(hf_model_id, legacy=False)
model:T5ForConditionalGeneration = T5ForConditionalGeneration.from_pretrained(hf_model_id).to(
dtype=dtype_obj)

# - Assume 'dtype' is already defined, e.g., dtype=torch.float32
# - Tokenizing the prompts list with specified data type
input_ids = tokenizer(prompts,
return_tensors="pt",
padding=True,
truncation=True).input_ids

# - If using GPU, also send input_ids to the same device as the model
if torch.cuda.is_available():
model = model.cuda() # Move model to GPU
input_ids = input_ids.cuda() # Move input_ids to GPU

# - Max token count for both native and vLLM test
max_tokens = 512

# - Generating outputs for all tokenized prompts
native_outputs = model.generate(input_ids,max_length = max_tokens).cpu()

# vLLM test
model: LLM = LLM(hf_model_id,
enforce_eager=True,
dtype=dtype,
gpu_memory_utilization=0.5)

sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0)

vllm_outputs = model.generate(
prompts,
sampling_params=sampling_params
)

print(native_outputs)
print(vllm_outputs)

# Print native & vLLM outputs
i = 0
for native_output, vllm_output in zip(native_outputs, vllm_outputs):
print(f"Prompt {i}:")

prompt = prompts[i] # Get the corresponding prompt for this output
native_generated_text = tokenizer.decode(
native_output, skip_special_tokens=True) # Decode the generated text
vllm_generated_text = vllm_output.outputs[0].text
print(
f"- Prompt: {prompt!r}, Native PyTorch generated text: " \
f"{native_generated_text!r}, " \
f"vLLM generated text: {vllm_generated_text!r}"
)

print("- Asserting textual match")
#assert native_generated_text == vllm_generated_text
print("- Asserting token match")
#assert native_output[1:-1].tolist() == vllm_output.outputs[0].token_ids[:-1]

i += 1
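
The assertions in the example above are still commented out while the integration is WIP. A self-contained sketch of the token-level check they hint at follows: HF T5 generation output begins with a decoder-start/pad token and ends with EOS, so the comparison trims those positions. The helper name and the exact trimming convention are assumptions carried over from the commented-out line, not guarantees of this PR.

from typing import List

def token_ids_match(native_ids: List[int], vllm_ids: List[int]) -> bool:
    # Drop the leading decoder-start token and trailing EOS from the HF output,
    # and the trailing EOS from the vLLM output, then compare (assumed convention).
    return native_ids[1:-1] == vllm_ids[:-1]

print(token_ids_match([0, 32, 45, 1], [32, 45, 1]))  # True
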