Add CLIP model to enable test_clip.py #1500

Open · wants to merge 5 commits into base: main
tests/deepsparse/pipelines/test_clip.py (54 changes: 39 additions, 15 deletions)
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

import pytest
from deepsparse.clip import (
CLIPCaptionInput,
@@ -30,24 +32,48 @@
from tests.utils import mock_engine


def custom_process_inputs(self, inputs):
if not isinstance(inputs.text, list):
inputs.text = [inputs.text]
if not isinstance(inputs.text[0], str):
return inputs.text
tokens = [np.array(t).astype(np.int32) for t in self.tokenizer(inputs.text)]
tokens = np.stack(tokens, axis=0)
tokens_lengths = np.array(tokens.shape[0] * [tokens.shape[1] - 1])
return [tokens, tokens_lengths]


# This overrides the process_inputs function globally for all CLIPTextPipeline classes.
# This is needed for CLIP-ViT-B-32-256x256-DataComp-s34B-b86K as it has a second input
# that specifies how many tokens are present.
CLIPTextPipeline.process_inputs = custom_process_inputs
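For illustration, a minimal sketch of the two arrays this override returns, using a hypothetical stand-in for the tokenizer output (the token ids and the padded length of 77 are assumptions, not taken from the model):

import numpy as np

# Hypothetical tokenizer output: three prompts, each padded to 77 ids.
tokenized = [[49406, 320, 1125, 49407] + [0] * 73 for _ in range(3)]

tokens = [np.array(t).astype(np.int32) for t in tokenized]
tokens = np.stack(tokens, axis=0)  # first engine input: stacked int32 token ids
tokens_lengths = np.array(tokens.shape[0] * [tokens.shape[1] - 1])  # second engine input: one length per sample

print(tokens.shape)    # (3, 77)
print(tokens_lengths)  # [76 76 76]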


@pytest.fixture
def visual_input():
def model_folder():
from huggingface_hub import snapshot_download

model_id = "neuralmagic/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K-quant-ds"
@dsikka (Contributor) commented on Jan 2, 2024:

Could we add a quick comment/note indicating that this model is not from OpenCLIP and only used for zero-shot classification?

return snapshot_download(repo_id=model_id)
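For context, snapshot_download resolves the repo to a local cache directory and reuses it on later calls, so only the first test session downloads anything. A minimal sketch, assuming the snapshot ships the visual.onnx and textual.onnx files that the fixtures below look up:

from huggingface_hub import snapshot_download

folder = snapshot_download(
    repo_id="neuralmagic/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K-quant-ds"
)
print(folder)                    # local cache path for the downloaded snapshot
print(folder + "/visual.onnx")   # image-branch model used by the visual_input fixture
print(folder + "/textual.onnx")  # text-branch model used by the text_input fixture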


@pytest.fixture
def visual_input(model_folder):
model_path = model_folder + "/visual.onnx"
images = computer_vision(batch_size=2)
model_path = None
return CLIPVisualInput(images=images.get("images")), model_path


@pytest.fixture
def text_input():
model_path = None
def text_input(model_folder):
model_path = model_folder + "/textual.onnx"
text = ["a building", "a dog", "a cat"]
return CLIPTextInput(text=text), model_path


@pytest.mark.skip(reason="No CLIP models currently available to run tests")
@mock_engine(rng_seed=0)
def test_visual_clip(engine, visual_input):
from deepsparse import Pipeline
from deepsparse.legacy import Pipeline
Contributor comment:

shouldn't be needed?


model_path = visual_input[-1]
pipeline = Pipeline.create(task="clip_visual", model_path=model_path)
@@ -57,10 +83,9 @@ def test_visual_clip(engine, visual_input):
assert len(output.image_embeddings) == 1


@pytest.mark.skip(reason="No CLIP models currently available to run tests")
@mock_engine(rng_seed=0)
def test_text_clip(engine, text_input):
from deepsparse import Pipeline
from deepsparse.legacy import Pipeline
Contributor comment:

same comment as above.


model_path = text_input[-1]
pipeline = Pipeline.create(task="clip_text", model_path=model_path)
@@ -70,18 +95,17 @@ def test_text_clip(engine, text_input):
assert len(output.text_embeddings) == 1


@pytest.mark.skip(reason="No CLIP models currently available to run tests")
@mock_engine(rng_seed=0)
def test_zero_shot(engine, visual_input, text_input):
from deepsparse.legacy import BasePipeline
from deepsparse.legacy import Pipeline

model_path_text = text_input[-1]
model_path_visual = visual_input[-1]
kwargs = {
"visual_model_path": model_path_visual,
"text_model_path": model_path_text,
}
pipeline = BasePipeline.create(task="clip_zeroshot", **kwargs)
pipeline = Pipeline.create(task="clip_zeroshot", **kwargs)
assert isinstance(pipeline, CLIPZeroShotPipeline)
pipeline_input = CLIPZeroShotInput(
image=CLIPVisualInput(images=visual_input[0].images[-1]), text=text_input[0]
@@ -90,12 +114,12 @@
assert isinstance(output, CLIPZeroShotOutput)


@pytest.mark.skip(reason="No CLIP models currently available to run tests")
@pytest.mark.skip(reason="No CLIP decoder models currently available to run tests")
@mock_engine(rng_seed=0)
def test_caption(engine, visual_input, text_input):
from deepsparse.legacy import BasePipeline
from deepsparse.legacy import Pipeline

model_path_visual = text_input[-1]
model_path_visual = visual_input[-1]
model_path_text = text_input[-1]
model_path_decoder = None
pipeline_input = CLIPCaptionInput(
@@ -106,6 +130,6 @@
"text_model_path": model_path_text,
"decoder_model_path": model_path_decoder,
}
pipeline = BasePipeline.create(task="clip_caption", **kwargs)
pipeline = Pipeline.create(task="clip_caption", **kwargs)
assert isinstance(pipeline, CLIPCaptionPipeline)
assert isinstance(pipeline_input, CLIPCaptionInput)
tests/utils/engine_mocking.py (11 changes: 9 additions, 2 deletions)
@@ -135,10 +135,17 @@ def execute_list_out(self, inputs: List[numpy.ndarray]) -> List[numpy.ndarray]:

def _to_descriptor(node: ort.NodeArg) -> "_NumpyDescriptor":
to_numpy_dtype = {
"tensor(float)": numpy.float32,
"tensor(double)": numpy.float64,
"tensor(uint8)": numpy.uint8,
"tensor(float)": numpy.float32,
"tensor(float16)": numpy.float16,
"tensor(int64)": numpy.int64,
"tensor(int32)": numpy.int32,
"tensor(int16)": numpy.int16,
"tensor(int8)": numpy.int8,
"tensor(uint64)": numpy.uint64,
"tensor(uint32)": numpy.uint32,
"tensor(uint16)": numpy.uint16,
"tensor(uint8)": numpy.uint8,
}
return _NumpyDescriptor(shape=node.shape, dtype=to_numpy_dtype[node.type])
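For context, a minimal sketch of the strings this table consumes: onnxruntime reports each graph input's element type as a string such as "tensor(int32)", which _to_descriptor maps to the numpy dtype the mock engine uses when fabricating outputs. The model path here is a hypothetical example:

import onnxruntime as ort

session = ort.InferenceSession("textual.onnx")  # hypothetical local ONNX file
for node in session.get_inputs():
    # e.g. "input_ids tensor(int32) ['batch', 77]"
    print(node.name, node.type, node.shape)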
