rebase

neuralmagic · May 7, 2024 · 13a1f5b · 13a1f5b · github-actions · May 7, 2024
2 parents 1fda4fc + 5c7a85d
commit 13a1f5b
Show file tree

Hide file tree

Showing 268 changed files with 10,240 additions and 3,104 deletions.
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
@@ -5,6 +5,19 @@ set -ex
 # Print ROCm version
 rocminfo
 
+
+# Write "reset" to the GPU state file, then poll it every 3 seconds until it
+# contains "clean" (presumably a host-side service performs the reset and
+# updates the file -- TODO confirm).
+# NOTE(review): the loop has no timeout; if the file never reports "clean",
+# this step blocks until something external stops the job.
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
+
+
 # Try building the docker image
 docker build -t rocm -f Dockerfile.rocm .
 
@@ -14,7 +27,8 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image
-docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
+export HIP_VISIBLE_DEVICES=1
+docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &
 
 # Wait for the server to start
 wait_for_server_to_start() {

diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
@@ -0,0 +1,37 @@
+# This script builds the Neuron docker image and runs the API server inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -e
+
+# Try building the docker image
+aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+docker build -t neuron -f Dockerfile.neuron .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f neuron || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image
+docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
+       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
+
+# Wait for the server to start.
+# Polls localhost:8000/health, sleeping 1 second between attempts, until it
+# answers with HTTP 200. Returns 0 once the server is healthy. After `timeout`
+# (300) unsuccessful polls it prints a message and returns 1, so that the
+# script's `set -e` aborts the run instead of testing a server that never
+# came up.
+wait_for_server_to_start() {
+    timeout=300
+    counter=0
+
+    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
+        sleep 1
+        counter=$((counter + 1))
+        if [ "$counter" -ge "$timeout" ]; then
+            echo "Timeout after $timeout seconds"
+            # Signal failure to the caller; a plain `break` would return 0.
+            return 1
+        fi
+    done
+}
+wait_for_server_to_start
+
+# Test a simple prompt.
+# --fail makes curl exit non-zero when the server answers with an HTTP error
+# status (>= 400), so a failing /generate request fails this script instead
+# of passing silently.
+curl --fail -X POST -H "Content-Type: application/json" \
+    localhost:8000/generate \
+    -d '{"prompt": "San Francisco is a"}'
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -12,7 +12,11 @@ steps:
   command: pytest -v -s async_engine
 
 - label: Basic Correctness Test
-  command: pytest -v -s basic_correctness
+  commands:
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test
   command: pytest -v -s core
@@ -27,13 +31,14 @@ steps:
   num_gpus: 2 # only support 1 or 2 for now.
   commands:
   - pytest -v -s test_pynccl.py
+  - pytest -v -s test_pynccl_library.py
   - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
 
 - label: Engine Test
-  command: pytest -v -s engine tokenization test_sequence.py test_config.py
+  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 
 - label: Entrypoints Test
   commands:
@@ -85,9 +90,15 @@ steps:
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
+- label: Tensorizer Test
+  # -y answers apt-get's confirmation prompt so the install cannot abort in the non-interactive CI job.
+  command: apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader
+
 - label: Metrics Test
   command: pytest -v -s metrics
 
+- label: Quantization Test
+  command: pytest -v -s quantization
+
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
   commands:

diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
@@ -3,13 +3,6 @@
 {% set default_working_dir = "/vllm-workspace/tests" %}
 
 steps:
-  - label: "AMD Test"
-    agents:
-      queue: amd
-    command: bash .buildkite/run-amd-test.sh
-
-  - label: "CPU Test"
-    command: bash .buildkite/run-cpu-test.sh
 
   - label: ":docker: build image"
     commands:
@@ -23,6 +16,19 @@ steps:
           limit: 5
   - wait
 
+  - label: "AMD Test"
+    agents:
+      queue: amd
+    command: bash .buildkite/run-amd-test.sh
+
+  - label: "Neuron Test"
+    agents:
+      queue: neuron
+    command: bash .buildkite/run-neuron-test.sh
+
+  - label: "CPU Test"
+    command: bash .buildkite/run-cpu-test.sh
+
   {% for step in steps %}
   - label: "{{ step.label }}"
     agents:

diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml
@@ -18,6 +18,7 @@ body:
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
     value: |
       ```text
       The output of `python collect_env.py`

diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml
@@ -18,6 +18,7 @@ body:
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
     value: |
       ```text
       The output of `python collect_env.py`

diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml
@@ -18,6 +18,7 @@ body:
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
     value: |
       ```text
       The output of `python collect_env.py`
@@ -57,6 +58,8 @@ body:
       If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
 
       Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
+
+      If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1`. All the function calls in vllm will be recorded. Inspect these log files, and tell us which function crashes or hangs.
     placeholder: |
       A clear and concise description of what the bug is.
 

diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance discussion.yml
@@ -39,6 +39,7 @@ body:
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
     value: |
       ```text
       The output of `python collect_env.py`

diff --git a/.github/actions/nm-benchmark/action.yml b/.github/actions/nm-benchmark/action.yml
@@ -19,6 +19,9 @@ runs:
   - id: benchmark
     run: |
       mkdir -p ${{ inputs.output_directory }}
+      # move source directories
+      mv vllm vllm-ignore || echo "no 'vllm' folder to move"
+      mv csrc csrc-ignore || echo "no 'csrc' folder to move"
       COMMIT=${{ github.sha }}
       VENV="${{ inputs.venv }}-${COMMIT:0:7}"
       source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate

diff --git a/.github/actions/nm-install-test-whl/action.yml b/.github/actions/nm-install-test-whl/action.yml
@@ -44,14 +44,12 @@ runs:
         pip3 install coverage
         pip3 install pytest-cov
         pip3 install pytest-xdist
-        pip3 install --index-url http://${{ inputs.pypi }}:8080/ --trusted-host ${{ inputs.pypi }} nm-magic-wand-nightly
-        pip3 list
+        pip3 install -r requirements-dev.txt
         BASE=$(./.github/scripts/convert-version ${{ inputs.python }})
         WHL=$(find . -type f -iname "*${BASE}*.whl")
         WHL_BASENAME=$(basename ${WHL})
         echo "whl=${WHL_BASENAME}" >> "$GITHUB_OUTPUT"
-        pip3 install ${WHL}
-        pip3 install -r requirements-dev.txt
+        pip3 install ${WHL}[sparse]
         # report magic_wand version
         MAGIC_WAND=$(pip3 show nm-magic-wand-nightly | grep "Version" | cut -d' ' -f2)
         echo "magic_wand=${MAGIC_WAND}" >> "$GITHUB_OUTPUT"

diff --git a/.github/actions/nm-install-whl/action.yml b/.github/actions/nm-install-whl/action.yml
@@ -0,0 +1,27 @@
+# Composite action: activates a pyenv virtualenv, installs the dev
+# requirements, then installs the wheel found in the workspace whose file name
+# matches the given python version, with the `sparse` extra.
+name: install whl
+description: 'installs found whl based on python version into specified venv'
+inputs:
+  python:
+    description: 'python version, e.g. 3.10.12'
+    required: true
+  # NOTE(review): `venv` is declared but the script below builds the venv name
+  # from `env.VENV_BASE`, not `inputs.venv` -- confirm which one is intended.
+  venv:
+    description: 'name for python virtual environment'
+    required: true
+runs:
+  using: composite
+  steps:
+    - id: install_whl
+      run: |
+        # move source directories
+        # (presumably so the installed wheel is imported rather than the
+        # checked-out sources -- TODO confirm)
+        mv vllm vllm-ignore
+        mv csrc csrc-ignore
+        # activate and install
+        # the venv name is the base name plus the first 7 characters of the commit sha
+        COMMIT=${{ github.sha }}
+        VENV="${{ env.VENV_BASE }}-${COMMIT:0:7}"
+        source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
+        pip3 install -r requirements-dev.txt
+        # convert-version turns the python version into the pattern used to
+        # locate the wheel file below
+        BASE=$(./.github/scripts/convert-version ${{ inputs.python }})
+        WHL=$(find . -type f -iname "*${BASE}*.whl")
+        # NOTE(review): WHL_BASENAME is assigned but not used later in this script
+        WHL_BASENAME=$(basename ${WHL})
+        pip3 install ${WHL}[sparse]
+      shell: bash
diff --git a/.github/actions/nm-set-python/action.yml b/.github/actions/nm-set-python/action.yml
@@ -20,7 +20,7 @@ runs:
         pyenv local ${{ inputs.python }}
         COMMIT=${{ github.sha }}
         VENV="${{ inputs.venv }}-${COMMIT:0:7}"
-        pyenv virtualenv ${VENV} || true
+        pyenv virtualenv --force ${VENV}
         source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
         VERSION=$(python --version)
         echo "version=${VERSION}" >> "$GITHUB_OUTPUT"

diff --git a/.github/scripts/nm-run-benchmarks.sh b/.github/scripts/nm-run-benchmarks.sh
@@ -3,18 +3,18 @@
 
 set -e
 set -u
-  
+
 if [ $# -ne 2 ];
 then
   echo "run_benchmarks needs exactly 2 arguments: "
   echo " 1. Path to a .txt file containing the list of benchmark config paths"
   echo " 2. The output path to store the benchmark results"
   exit 1
 fi
-  
+
 benchmark_config_list_file=$1
 output_directory=$2
-  
+
 for bench_config in `cat $benchmark_config_list_file`
 do
   echo "Running benchmarks for config " $bench_config

diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests
@@ -113,8 +113,8 @@ do
     # need to be run with specific options
     if [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
         CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
-    elif [[ "${TEST}" == *"test_basic_distributed_correctness"* ]]; then
-        CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=facebook/opt-125m pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
+    elif [[ "${TEST}" == *"distributed"* ]]; then
+        CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then
         pytest --forked ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     else
Benchmark suite	Current: `13a1f5b`	Previous: `df1f1a0`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.2.1+cu121"}`	`4.026795745265708` prompts/s	`3.80234884054723` prompts/s	`0.94`
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.2.1+cu121"}`	`1546.2895661820319` tokens/s	`1460.1019547701362` tokens/s	`0.94`