Support for compressed-tensors
#159
base: main
Changes from 13 commits
@@ -4,4 +4,5 @@ ninja
 packaging
 setuptools>=49.4.0
 torch==2.2.1
+compressed-tensors
 wheel
@@ -0,0 +1,61 @@
"""Compare the outputs from two identical models:
- one loaded from uncompressed safetensors
- one loaded from `compressed-tensors`.
The expectation is that inference produces the same
behavior for both models.
"""
from typing import Tuple

import pytest
from compare_utils import check_logprobs_close

MODEL_MAX_LEN = 1024

# pair of the same model with compressed and ordinary safetensors
MODELS = [
    (
        "neuralmagic/llama2.c-stories110M-pruned50",  # uncompressed
        "dtransposed/llama2.c-stories110M-pruned50-compressed-tensors",  # compressed
    )
]


@pytest.mark.parametrize("model_pair", MODELS)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [3])
def test_models(
    vllm_runner_nm,
    example_prompts,
    model_pair: Tuple[str, str],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:

    model_uncompressed, model_compressed = model_pair

    vllm_model_0 = vllm_runner_nm(model_uncompressed,
                                  dtype=dtype,
                                  max_model_len=MODEL_MAX_LEN)

    vllm_outputs_0 = vllm_model_0.generate_greedy_logprobs(
        example_prompts, max_tokens, num_logprobs)

    # free the first model to release GPU memory before loading the next
    del vllm_model_0

    vllm_model_1 = vllm_runner_nm(model_compressed,
                                  dtype=dtype,
                                  max_model_len=MODEL_MAX_LEN)

    vllm_outputs_1 = vllm_model_1.generate_greedy_logprobs(
        example_prompts, max_tokens, num_logprobs)

    del vllm_model_1

    # loop through the prompts and compare the top logprobs
    check_logprobs_close(
        outputs_0_lst=vllm_outputs_0,
        outputs_1_lst=vllm_outputs_1,
        name_0="vllm_model_from_uncompressed_weights",
        name_1="vllm_model_from_compressed_weights",
    )
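Note: `check_logprobs_close` comes from the shared test utilities and is not shown in this diff. As a rough illustration of the acceptance criterion it enforces, here is a minimal sketch, assuming each output is a (token_ids, text, logprobs) tuple whose logprobs entry holds, per position, a dict of top candidate token ids to log-probabilities (the real helper's format may differ):

# Illustrative sketch only -- not the actual check_logprobs_close.
def check_logprobs_close_sketch(outputs_0_lst, outputs_1_lst, name_0, name_1):
    for prompt_idx, (out_0, out_1) in enumerate(
            zip(outputs_0_lst, outputs_1_lst)):
        ids_0, _, logprobs_0 = out_0
        ids_1, _, logprobs_1 = out_1
        for pos, (tok_0, tok_1) in enumerate(zip(ids_0, ids_1)):
            if tok_0 == tok_1:
                continue
            # a greedy-token mismatch is tolerated only if each model's
            # choice still ranks among the other's top candidates
            assert tok_0 in logprobs_1[pos] and tok_1 in logprobs_0[pos], (
                f"prompt {prompt_idx}, position {pos}: "
                f"{name_0} picked {tok_0}, {name_1} picked {tok_1}")
            # once the sequences diverge, later tokens are not comparable
            break

The idea is that two representations of the same weights can still differ in floating-point rounding, so exact token equality is relaxed to top-k agreement.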
@@ -2,9 +2,11 @@
 import json
 import os
 from dataclasses import dataclass, fields
-from typing import TYPE_CHECKING, ClassVar, Optional, Union
+from enum import Enum
+from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union

+import torch
+from compressed_tensors import SPARSITY_CONFIG_NAME, CompressionConfig
 from packaging.version import Version
 from transformers import PretrainedConfig
@@ -21,6 +23,12 @@
 _GB = 1 << 30


+# UPSTREAM SYNC: keep sparsity
+class SparsityStructure(Enum):
+
+    sparse_w16a16 = "sparse_w16a16"
+    semi_structured_sparse_w16a16 = "semi_structured_sparse_w16a16"


 class ModelConfig:
     """Configuration for the model.
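For context, the enum above is the target of the config translation added further down. A hypothetical sketch of the sparsity payload a model config might carry; the only field this diff actually reads is "sparsity_structure" (with the values "unstructured", "0:0", and "2:4"), while the other field names here are assumptions for illustration:

# hypothetical sparsity_config attached to a HF model config under
# whatever attribute name SPARSITY_CONFIG_NAME resolves to
sparsity_config = {
    "format": "sparse-bitmask",            # assumed example value
    "sparsity_structure": "unstructured",  # handled below: "unstructured", "0:0", "2:4"
    "global_sparsity": 0.5,                # assumed example value
}

# mapping implemented by _sparsity_structure_from_config below:
#   "unstructured" or "0:0" -> SparsityStructure.sparse_w16a16
#   "2:4"                   -> SparsityStructure.semi_structured_sparse_w16a16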
@@ -182,29 +190,71 @@ def _verify_tokenizer_mode(self) -> None:

     # UPSTREAM SYNC: keep sparsity
     def _verify_sparsity(self) -> None:
-        supported_sparsity = ["sparse_w16a16", "semi_structured_sparse_w16a16"]
+        supported_sparsity = {
+            sparsity_structure.value
+            for sparsity_structure in SparsityStructure
+        }
+
+        hf_sparsity_config = getattr(self.hf_config, SPARSITY_CONFIG_NAME,
+                                     None)
+        if hf_sparsity_config is not None:
+            sparsity_structure = self._sparsity_structure_from_config(
+                hf_sparsity_config, dtype=self.dtype)
+            if self.sparsity is not None:
+                logger.info("Overriding the sparsity structure "
+                            "inferred from the config: "
+                            f"{sparsity_structure} with: {self.sparsity}")
+            self.sparsity = self.sparsity or sparsity_structure
+        if (self.sparsity not in supported_sparsity) and \
+                (self.sparsity is not None):
+            raise ValueError(
+                f"Unknown sparsity_structure: {self.sparsity}. Must "
+                f"be one of {supported_sparsity}. Running the models "
+                "without sparse kernels.")

Review comment (on lines +151 to +154): This says "Running the models without sparse kernels." but it is raising an error - I think this should just warn and continue with unstructured sparse kernels?

         if self.quantization is not None and self.sparsity is not None:
             raise ValueError("Both sparsity and quantization detected. Only "
                              "one or the other is supported at a time.")

-        if (self.sparsity is not None
-                and self.sparsity not in supported_sparsity):
-            raise ValueError(f"Unknown sparse method: {self.sparsity}. Must "
-                             f"be one of {supported_sparsity}.")
-        hf_sparsity_config = getattr(self.hf_config, "sparsity_config", None)
-        if hf_sparsity_config is not None:
-            hf_sparsity_method = str(
-                hf_sparsity_config["sparse_method"]).lower()
-            if self.sparsity is None:
-                self.sparsity = hf_sparsity_method
-            elif self.sparsity != hf_sparsity_method:
-                raise ValueError(
-                    "Sparsity method specified in the model config "
-                    f"({hf_sparsity_method}) does not match the sparsity "
-                    f"method specified in the `sparsity` argument "
-                    f"({self.sparsity}).")
+    @staticmethod
+    def _sparsity_structure_from_config(
+            sparsity_config: Dict[str, Any],
+            dtype: torch.dtype) -> Optional[str]:
+        """
+        Translate the sparsity_config into an appropriate sparsity structure.
+
+        :param sparsity_config: A dictionary specifying the sparsity config
+        :param dtype: The dtype of the model in question
+        :return: The appropriate sparsity structure as a string, or None
+            if it cannot be inferred
+        """
+        supported_sparsity_dtypes = {torch.float16, torch.bfloat16}
+
+        # check the validity of sparsity_config: reject keys that are not
+        # fields of compressed-tensors' CompressionConfig
+        unexpected_keys = set(sparsity_config.keys()).difference(
+            CompressionConfig.model_fields.keys())
+        if unexpected_keys:
+            raise ValueError("The detected sparsity_config contains "
+                             f"unexpected keys: {unexpected_keys}")
+
+        # check for valid dtype
+        if dtype not in supported_sparsity_dtypes:
+            logger.warning(
+                f"Sparsity is only supported for {supported_sparsity_dtypes} "
+                "dtypes. Running the models without sparse kernels.")
+            return None

Review comment: I actually meant the current dtype, but supported dtypes are good too!

+
+        # choose the sparsity structure based on the sparsity config
+        if sparsity_config["sparsity_structure"] in {"unstructured", "0:0"}:
+            return SparsityStructure.sparse_w16a16.value
+        elif sparsity_config["sparsity_structure"] == "2:4":
+            return SparsityStructure.semi_structured_sparse_w16a16.value
+
+        # if the sparsity config is not recognized, return None
+        logger.warning("A valid sparsity structure cannot be inferred from "
+                       f"the sparsity config:\n{sparsity_config}"
+                       "\nRunning the models without sparse kernels.")
+        return None

     def _verify_quantization(self) -> None:
         supported_quantization = ["awq", "gptq", "squeezellm", "marlin"]
Review comment: Why remove bfloat16 here?
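As background on the "2:4" branch above: N:M semi-structured sparsity means at most N of every M contiguous weights are nonzero, which is the layout the semi-structured sparse kernels require. A minimal sketch of a 2:4 pattern check; the helper below is illustrative and not part of this PR:

import torch

def is_2_4_sparse(weight: torch.Tensor) -> bool:
    """Return True if every contiguous group of 4 values along the
    last dimension has at most 2 nonzeros (the 2:4 pattern)."""
    assert weight.shape[-1] % 4 == 0, "last dim must be divisible by 4"
    groups = weight.reshape(-1, 4)
    nonzeros_per_group = (groups != 0).sum(dim=-1)
    return bool((nonzeros_per_group <= 2).all())

# e.g. the row [0.5, 0.0, -1.2, 0.0] satisfies 2:4,
# while [1.0, 1.0, 1.0, 0.0] (three nonzeros in a group of four) does not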