updated

neuralmagic · May 13, 2024 · 5a4198a · 5a4198a · github-actions · May 14, 2024
1 parent 03acaca
commit 5a4198a
Show file tree

Hide file tree

Showing 2 changed files with 1 addition and 2 deletions.
diff --git a/vllm/config.py b/vllm/config.py
@@ -251,7 +251,6 @@ def _verify_quantization(self) -> None:
                     "optimized yet. The speed can be slower than "
                     "non-quantized models.", self.quantization)
 
-
     def _verify_cuda_graph(self) -> None:
         if self.max_seq_len_to_capture is None:
             self.max_seq_len_to_capture = self.max_model_len

diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
@@ -18,10 +18,10 @@
     "awq": AWQConfig,
     "fp8": Fp8Config,
     "gptq": GPTQConfig,
-    "squeezellm": SqueezeLLMConfig,
     "gptq_marlin": GPTQMarlinConfig,
     "gptq_marlin_24": GPTQMarlin24Config,
     "marlin": MarlinConfig,
+    "squeezellm": SqueezeLLMConfig,
 }
| Benchmark suite | Current: `5a4198a` | Previous: `df1f1a0` | Ratio |
|-|-|-|-|
| `{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]", "torch_version": "2.3.0+cu121"}` | `3.8369451773150796` prompts/s | | |
| `{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]", "torch_version": "2.3.0+cu121"}` | `1473.3869480889905` tokens/s | | |