AWS text classification benchmark #1059

Open · wants to merge 11 commits into `main`
66 changes: 66 additions & 0 deletions examples/aws-text-benchmarks/benchmark_deepsparse.py
@@ -0,0 +1,66 @@
from deepsparse import Pipeline, Context
import deepsparse.transformers
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm
import numpy as np
import time, os

os.environ["NM_BIND_THREADS_TO_CORES"] = "1"
INPUT_COL = "text"
dataset = load_dataset("ag_news", split="train[:3000]")
batch_size = 64
buckets = [64, 128, 256]
model_path = "./sparse-model/deployment/"

### TOKENIZE DATASET - (used to compute buckets)
tokenizer = AutoTokenizer.from_pretrained(model_path)


def pre_process_fn(examples):
    return tokenizer(
        examples[INPUT_COL],
        add_special_tokens=True,
        return_tensors="np",
        padding=False,
        truncation=False,
    )


dataset = dataset.map(pre_process_fn, batched=True)
dataset = dataset.add_column("num_tokens", list(map(len, dataset["input_ids"])))
dataset = dataset.sort("num_tokens")
max_token_len = dataset[-1]["num_tokens"]
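
# Optional diagnostic (an added sketch, not part of the original benchmark):
# print a few token-length percentiles to sanity-check the hardcoded buckets above.
print("Token-length percentiles (50/90/99):", np.percentile(dataset["num_tokens"], [50, 90, 99]))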

### SPLIT DATA INTO BATCHES
# pad the front of the (sorted) inputs so the total divides evenly into batches;
# the second modulo avoids adding a full batch of padding when it already divides evenly
num_pad_items = (batch_size - dataset.num_rows % batch_size) % batch_size
inputs = ([""] * num_pad_items) + dataset[INPUT_COL]
batches = []

for b_index_start in range(0, len(inputs), batch_size):
    batches.append(inputs[b_index_start:b_index_start + batch_size])

### RUN THROUGHPUT TESTING
print("\nCompiling models:")

# compile one engine per bucket; each input is routed to the smallest
# bucket that fits its tokenized length
tc_pipeline = Pipeline.create(
    task="zero_shot_text_classification",
    model_path=model_path,
    model_scheme="mnli",
    sequence_length=buckets,
    batch_size=batch_size,
    context=Context(num_streams=1),
)
print("\nRunning test:")
# run inferences on the dataset
start = time.perf_counter()

predictions = []
for batch in tqdm(batches):
    predictions.append(tc_pipeline(sequences=batch, labels=['Sports', 'Business', 'Sci/Tech']))

# flatten and remove padded predictions
predictions = [pred for sublist in predictions for pred in sublist.labels]
predictions = predictions[num_pad_items:]
end = time.perf_counter()

# compute throughput
total_time_executing = end - start
print(f"Total time: {total_time_executing}")
items_per_sec = len(predictions) / total_time_executing

print(f"Items Per Second: {items_per_sec}")
49 changes: 49 additions & 0 deletions examples/aws-text-benchmarks/benchmark_huggingface.py
@@ -0,0 +1,49 @@
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer
from transformers.pipelines.pt_utils import KeyDataset
from tqdm import tqdm
import time
import torch
model_path = "./dense-model/training/"
batch_size = 64

### SETUP DATASETS - in this case, we download ag_news
print("Setting up the dataset:")

INPUT_COL = "text"
dataset = load_dataset("ag_news", split="train[:3000]")

### TOKENIZE DATASET - to sort by sequence length
tokenizer = AutoTokenizer.from_pretrained(model_path)

def pre_process_fn(examples):
    return tokenizer(
        examples[INPUT_COL],
        add_special_tokens=True,
        return_tensors="np",
        padding=False,
        truncation=False,
    )

dataset = dataset.map(pre_process_fn, batched=True)
dataset = dataset.add_column("num_tokens", list(map(len, dataset["input_ids"])))
# sort by length so each batch contains similarly-sized sequences (less padding)
dataset = dataset.sort("num_tokens")

### SPLIT DATA INTO BATCHES
hf_dataset = KeyDataset(dataset, INPUT_COL)
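# KeyDataset exposes just the text column, so the HF pipeline can stream the
# dataset and handle batching internally (no manual padding or batching needed)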

### RUN THROUGHPUT TESTING
# load model
hf_pipeline = pipeline(
    "zero-shot-classification",
    model_path,
    batch_size=batch_size,
    device=("cuda:0" if torch.cuda.is_available() else "cpu"),
)

# run inferences
start = time.perf_counter()

predictions = []
for prediction in hf_pipeline(hf_dataset, candidate_labels=['Sports', 'Business', 'Sci/Tech']):
    predictions.append(prediction)

# make sure all queued GPU work has finished before stopping the timer
if torch.cuda.is_available():
    torch.cuda.synchronize()

end = time.perf_counter()

# compute throughput
total_time_executing = end - start
items_per_sec = len(predictions) / total_time_executing

print(f"Total time: {total_time_executing}")
print(f"Items Per Second: {items_per_sec}")
Binary file added examples/aws-text-benchmarks/image.png
49 changes: 49 additions & 0 deletions examples/aws-text-benchmarks/readme.md
@@ -0,0 +1,49 @@
This example contains benchmarking scripts that compare text-classification throughput for two setups: DeepSparse running a sparse model on CPU, and HuggingFace + PyTorch running a dense model on GPU.

In this example, we run on the `ag_news` dataset with models downloaded from SparseZoo.

## Sparse Model with DeepSparse

Install DeepSparse:

```bash
pip install deepsparse
```

Download Sparse Model:

```bash
sparsezoo.download zoo:nlp/text_classification/bert-large/pytorch/huggingface/mnli/pruned90_quant-none --save-dir ./sparse-model
```

Run the DeepSparse benchmark (compiles buckets for token lengths 64, 128, and 256):

```bash
python benchmark_deepsparse.py
```

Note: DeepSparse uses static input shapes. Since the inputs in a dataset vary in sequence length, we use bucketing: DeepSparse is compiled with several input shapes, and each input is dynamically routed to the smallest bucket that fits it.
For `ag_news` (the example dataset used here), the distribution of token lengths looks like the following:
![Histogram](image.png)

As such, we used buckets of length 64, 128, and 256. DeepSparse runs best with sequence lengths that are multiples of 16; the sketch below shows one way to derive such buckets from the data.
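
A minimal sketch of how such buckets could be derived, assuming the `num_tokens` column computed in `benchmark_deepsparse.py`; `pick_buckets` and the percentile cut-offs are illustrative assumptions, not part of this example:

```python
import numpy as np

def pick_buckets(num_tokens, percentiles=(50, 90, 99)):
    """Round each token-length percentile up to the next multiple of 16."""
    return sorted({int(np.ceil(np.percentile(num_tokens, p) / 16) * 16) for p in percentiles})

# e.g., with the ag_news token lengths from benchmark_deepsparse.py:
# pick_buckets(dataset["num_tokens"])  # -> something like [64, 128, 256]
```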

## Dense Model with HuggingFace on GPU

Install `transformers`, `datasets`, and `sparsezoo`:
```bash
pip install transformers[torch]
pip install datasets
pip install sparsezoo
```

Download Dense Model:
```bash
sparsezoo.download zoo:nlp/text_classification/bert-large/pytorch/huggingface/mnli/base-none --save-dir ./dense-model
```

Run the HF benchmark (on GPU):
```bash
python benchmark_huggingface.py
```