Merge branch 'main' into test-optional-libraries

neuralmagic · May 14, 2024 · 8812e42 · 8812e42 · github-actions · May 14, 2024
2 parents c849276 + 3a25456
commit 8812e42
Show file tree

Hide file tree

Showing 296 changed files with 16,887 additions and 4,598 deletions.
diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
@@ -0,0 +1,36 @@
+import os
+import zipfile
+
+MAX_SIZE_MB = 100
+
+
+def print_top_10_largest_files(zip_file):
+    with zipfile.ZipFile(zip_file, 'r') as z:
+        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
+        file_sizes.sort(key=lambda x: x[1], reverse=True)
+        for f, size in file_sizes[:10]:
+            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
+
+
+def check_wheel_size(directory):
+    for root, _, files in os.walk(directory):
+        for f in files:
+            if f.endswith(".whl"):
+                wheel_path = os.path.join(root, f)
+                wheel_size = os.path.getsize(wheel_path)
+                wheel_size_mb = wheel_size / (1024 * 1024)
+                if wheel_size_mb > MAX_SIZE_MB:
+                    print(
+                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
+                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
+                    print_top_10_largest_files(wheel_path)
+                    return 1
+                else:
+                    print(f"Wheel {wheel_path} is within the allowed size "
+                          f"({wheel_size_mb} MB).")
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(check_wheel_size(sys.argv[1]))
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
@@ -1,10 +1,11 @@
-# This script build the ROCm docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
+# This script build the ROCm docker image and runs test inside it.
 set -ex
 
 # Print ROCm version
+echo "--- ROCm info"
 rocminfo
 
+echo "--- Resetting GPUs"
 
 echo "reset" > /opt/amdgpu/etc/gpu_state
 
@@ -16,37 +17,28 @@ while true; do
         fi
 done
 
+echo "--- Building container"
+sha=$(git rev-parse --short HEAD)
+container_name=rocm_${sha}
+docker build \
+        -t ${container_name} \
+        -f Dockerfile.rocm \
+        --progress plain \
+        .
+
+remove_docker_container() {
+   docker rm -f ${container_name} || docker image rm -f ${container_name} || true
+}
+trap remove_docker_container EXIT
 
+echo "--- Running container"
 
-# Try building the docker image
-docker build -t rocm -f Dockerfile.rocm .
-
-# Setup cleanup
-remove_docker_container() { docker rm -f rocm || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image
-export HIP_VISIBLE_DEVICES=1
-docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-    timeout=300
-    counter=0
-
-    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
-        sleep 1
-        counter=$((counter + 1))
-        if [ $counter -ge $timeout ]; then
-            echo "Timeout after $timeout seconds"
-            break
-        fi
-    done
-}
-wait_for_server_to_start
+docker run \
+        --device /dev/kfd --device /dev/dri \
+        --network host \
+        --rm \
+        -e HF_TOKEN \
+        --name ${container_name} \
+        ${container_name} \
+        /bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//")
 
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-    localhost:8000/generate \
-    -d '{"prompt": "San Francisco is a"}'
diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
@@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md
 tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
 echo '```' >> benchmark_results.md
 
+# if the agent binary is not found, skip uploading the results, exit 0
+if [ ! -f /workspace/buildkite-agent ]; then
+    exit 0
+fi
+
 # upload the results to buildkite
 /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
 

diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
@@ -4,6 +4,20 @@ set -e
 
 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+
+# prune old image and containers to save disk space, and only once a day
+# by using a timestamp file in tmp.
+if [ -f /tmp/neuron-docker-build-timestamp ]; then
+    last_build=$(cat /tmp/neuron-docker-build-timestamp)
+    current_time=$(date +%s)
+    if [ $((current_time - last_build)) -gt 86400 ]; then
+        docker system prune -f
+        echo $current_time > /tmp/neuron-docker-build-timestamp
+    fi
+else
+    echo $(date +%s) > /tmp/neuron-docker-build-timestamp
+fi
+
 docker build -t neuron -f Dockerfile.neuron .
 
 # Setup cleanup

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -17,27 +17,38 @@ steps:
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
 - label: Core Test
+  mirror_hardwares: [amd]
   command: pytest -v -s core
 
 - label: Distributed Comm Ops Test
   command: pytest -v -s test_comm_ops.py
   working_dir: "/vllm-workspace/tests/distributed"
-  num_gpus: 2 # only support 1 or 2 for now.
+  num_gpus: 2
 
 - label: Distributed Tests
   working_dir: "/vllm-workspace/tests/distributed"
+
   num_gpus: 2 # only support 1 or 2 for now.
+  mirror_hardwares: [amd]
+
   commands:
-  - pytest -v -s test_pynccl.py
   - pytest -v -s test_pynccl_library.py
   - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
 
+- label: Distributed Tests (Multiple Groups)
+  working_dir: "/vllm-workspace/tests/distributed"
+  num_gpus: 4
+  commands:
+  - pytest -v -s test_pynccl.py
+
 - label: Engine Test
+  mirror_hardwares: [amd]
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 
 - label: Entrypoints Test
@@ -48,6 +59,7 @@ steps:
 
 - label: Examples Test
   working_dir: "/vllm-workspace/examples"
+  mirror_hardwares: [amd]
   commands:
     # install aws cli for llava_example.py
     - pip install awscli
@@ -61,29 +73,35 @@ steps:
   parallelism: 4
 
 - label: Models Test
+  mirror_hardwares: [amd]
   commands:
     - bash ../.buildkite/download-images.sh
     - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
 
 - label: Llava Test
+  mirror_hardwares: [amd]
   commands:
     - bash ../.buildkite/download-images.sh
     - pytest -v -s models/test_llava.py
 
 - label: Prefix Caching Test
+  mirror_hardwares: [amd]
   commands:
     - pytest -v -s prefix_caching
 
 - label: Samplers Test
   command: pytest -v -s samplers
 
 - label: LogitsProcessor Test
+  mirror_hardwares: [amd]
   command: pytest -v -s test_logits_processor.py
 
 - label: Worker Test
+  mirror_hardwares: [amd]
   command: pytest -v -s worker
 
 - label: Speculative decoding tests
+  mirror_hardwares: [amd]
   command: pytest -v -s spec_decode
 
 - label: LoRA Test %N
@@ -101,6 +119,7 @@ steps:
 
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
+  mirror_hardwares: [amd]
   commands:
   - pip install aiohttp
   - bash run-benchmarks.sh

diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
@@ -16,17 +16,29 @@ steps:
           limit: 5
   - wait
 
-  - label: "AMD Test"
-    agents:
-      queue: amd
-    command: bash .buildkite/run-amd-test.sh
+  - group: "AMD Tests"
+    depends_on: ~
+    steps:
+    {% for step in steps %}
+    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
+      - label: "AMD: {{ step.label }}"
+        agents:
+          queue: amd
+        command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'"
+        env:
+          DOCKER_BUILDKIT: "1"
+    {% endif %}
+    {% endfor %}
 
   - label: "Neuron Test"
+    depends_on: ~
     agents:
       queue: neuron
     command: bash .buildkite/run-neuron-test.sh
+    soft_fail: true
 
-  - label: "CPU Test"
+  - label: "Intel Test"
+    depends_on: ~
     command: bash .buildkite/run-cpu-test.sh
 
   {% for step in steps %}
@@ -44,6 +56,9 @@ steps:
     plugins:
       - kubernetes:
           podSpec:
+            {% if step.num_gpus %}
+            priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
+            {% endif %}
             volumes:
               - name: dshm
                 emptyDir:

diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml
@@ -0,0 +1,49 @@
+name: 💬 Request for comments (RFC).
+description: Ask for feedback on major architectural changes or design choices.
+title: "[RFC]: "
+labels: ["RFC"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
+- type: textarea
+  attributes:
+    label: Motivation.
+    description: >
+      The motivation of the RFC.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Proposed Change.
+    description: >
+      The proposed change of the RFC.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Feedback Period.
+    description: >
+      The feedback period of the RFC. Usually at least one week.
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: CC List.
+    description: >
+      The list of people you want to CC.
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Any Other Things.
+    description: >
+      Any other things you would like to mention.
+  validations:
+    required: false
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
diff --git a/.github/actions/nm-lm-eval-accuracy/action.yml b/.github/actions/nm-lm-eval-accuracy/action.yml
@@ -12,6 +12,10 @@ runs:
   steps:
   - id: lm-eval
     run: |
+      # move source directories
+      mv vllm vllm-ignore || echo "no 'vllm' folder to move"
+      mv csrc csrc-ignore || echo "no 'csrc' folder to move"
+
       COMMIT=${{ github.sha }}
       VENV="${{ inputs.venv }}-${COMMIT:0:7}"
       source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
@@ -20,7 +24,7 @@ runs:
       pip3 install pytest openai==1.3.9
 
       SUCCESS=0
-      pytest .github/scripts/test_lm_eval_sweep.py -s -v || SUCCESS=$?
+      pytest -v tests/accuracy/test_lm_eval_correctness.py || SUCCESS=$?
       echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT"
       exit ${SUCCESS}
     shell: bash
diff --git a/.github/actions/nm-lm-eval-smoke/action.yml b/.github/actions/nm-lm-eval-smoke/action.yml
@@ -12,6 +12,10 @@ runs:
   steps:
   - id: lm-eval
     run: |
+      # move source directories
+      mv vllm vllm-ignore || echo "no 'vllm' folder to move"
+      mv csrc csrc-ignore || echo "no 'csrc' folder to move"
+
       COMMIT=${{ github.sha }}
       VENV="${{ inputs.venv }}-${COMMIT:0:7}"
       source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate

diff --git a/.github/data/nm_benchmark_weekly_configs_list.txt b/.github/data/nm_benchmark_weekly_configs_list.txt
@@ -0,0 +1,5 @@
+neuralmagic/benchmarks/configs/benchmark_serving.json
+neuralmagic/benchmarks/configs/benchmark_throughput.json
+neuralmagic/benchmarks/configs/benchmark_throughput_decode.json
+neuralmagic/benchmarks/configs/benchmark_throughput_prefill.json
+neuralmagic/benchmarks/configs/benchmark_remote_push.json
diff --git a/.github/scripts/lm_eval_compare_hf_vs_vllm.py b/.github/scripts/lm_eval_compare_hf_vs_vllm.py
@@ -38,7 +38,7 @@ def print_results(data_to_print: List = None,
 def check_passing_score(results_dict: Dict = None,
                         alpha: float = None) -> bool:
     for task in results_dict:
-        p_value = task["p_value"]
+        p_value = results_dict[task]["p_value"]
         if p_value <= alpha:
             return False
     return True
@@ -120,6 +120,6 @@ def parse_args():
         all_res[task1[0]] = {"z": z, "p_value": p_value}
     print_results([results_hf["results"], results_vllm["results"]], all_res,
                   args.alpha)
-    if not check_passing_score:
+    if not check_passing_score(all_res, args.alpha):
         print("Accuracy test failed!")
         exit(1)
diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests
@@ -117,6 +117,8 @@ do
         CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then
         pytest --forked ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
+    elif [[ "${TEST}" == *"basic_correctness/test_preemption"* ]]; then
+        VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     else
         pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     fi
Benchmark suite	Current: `8812e42`	Previous: `d485d3e`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]", "torch_version": "2.3.0+cu121"}`	`3.8388254207613244` prompts/s	`3.8408865302828166` prompts/s	`1.00`
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]", "torch_version": "2.3.0+cu121"}`	`1474.1089615723486` tokens/s	`1474.9004276286016` tokens/s	`1.00`