Skip to content

Commit

Permalink
rebase
Browse files Browse the repository at this point in the history
  • Loading branch information
bogunowicz@arrival.com committed May 7, 2024
2 parents 1fda4fc + 5c7a85d commit 13a1f5b
Show file tree
Hide file tree
Showing 268 changed files with 10,240 additions and 3,104 deletions.
16 changes: 15 additions & 1 deletion .buildkite/run-amd-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,19 @@ set -ex
# Print ROCm version
rocminfo


# Request a GPU reset by writing "reset" into the state file, then poll the
# file every 3 seconds until it contains "clean".
# NOTE(review): presumably something outside this script rewrites the file
# once the reset has finished - confirm. There is no upper bound on the wait.
echo "reset" > /opt/amdgpu/etc/gpu_state

# Always wait once before the first check, then keep waiting between checks.
sleep 3
until grep -q clean /opt/amdgpu/etc/gpu_state; do
  sleep 3
done
echo "GPUs state is \"clean\""



# Try building the docker image
docker build -t rocm -f Dockerfile.rocm .

Expand All @@ -14,7 +27,8 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
export HIP_VISIBLE_DEVICES=1
docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &

# Wait for the server to start
wait_for_server_to_start() {
Expand Down
37 changes: 37 additions & 0 deletions .buildkite/run-neuron-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -e

# Try building the docker image
# Log docker in to the ECR registry first; the password is piped through stdin
# (--password-stdin) instead of being passed as a command-line argument.
# NOTE(review): presumably Dockerfile.neuron pulls its base image from this
# registry - confirm.
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
# Force-remove the container named "neuron". A failing `docker rm` (for
# example when no such container exists) is deliberately ignored so that the
# function always succeeds.
remove_docker_container() {
  docker rm -f neuron || :
}

# Remove the container whenever the script exits ...
trap remove_docker_container EXIT
# ... and once right now, to clear out anything left from an earlier run.
remove_docker_container

# Run the image
# The trailing `&` starts the container in the background, so the script can
# continue and poll the API server while it is starting up.
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
#######################################
# Block until the API server's health endpoint answers with HTTP 200.
# Polls localhost:8000/health, sleeping 1 second after every unsuccessful
# check, and gives up after 300 unsuccessful checks.
# Globals:   none (timeout and counter are function-local)
# Arguments: none
# Outputs:   "Timeout after 300 seconds" on stderr when giving up
# Returns:   0 once the endpoint reports 200, 1 on timeout
#######################################
wait_for_server_to_start() {
  local timeout=300
  local counter=0

  # The status code is captured inside the test, so a failing curl (server not
  # listening yet) only yields a non-200 string and cannot trip `set -e`.
  until [[ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" == "200" ]]; do
    sleep 1
    counter=$((counter + 1))
    if (( counter >= timeout )); then
      echo "Timeout after ${timeout} seconds" >&2
      # Report the failure to the caller instead of silently falling through
      # as if the server had come up.
      return 1
    fi
  done
  return 0
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'
15 changes: 13 additions & 2 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@ steps:
command: pytest -v -s async_engine

- label: Basic Correctness Test
command: pytest -v -s basic_correctness
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py

- label: Core Test
command: pytest -v -s core
Expand All @@ -27,13 +31,14 @@ steps:
num_gpus: 2 # only support 1 or 2 for now.
commands:
- pytest -v -s test_pynccl.py
- pytest -v -s test_pynccl_library.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py

- label: Engine Test
command: pytest -v -s engine tokenization test_sequence.py test_config.py
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py

- label: Entrypoints Test
commands:
Expand Down Expand Up @@ -85,9 +90,15 @@ steps:
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4

- label: Tensorizer Test
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader

- label: Metrics Test
command: pytest -v -s metrics

- label: Quantization Test
command: pytest -v -s quantization

- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
commands:
Expand Down
20 changes: 13 additions & 7 deletions .buildkite/test-template.j2
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,6 @@
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
- label: "AMD Test"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh

- label: "CPU Test"
command: bash .buildkite/run-cpu-test.sh

- label: ":docker: build image"
commands:
Expand All @@ -23,6 +16,19 @@ steps:
limit: 5
- wait

- label: "AMD Test"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh

- label: "Neuron Test"
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh

- label: "CPU Test"
command: bash .buildkite/run-cpu-test.sh

{% for step in steps %}
- label: "{{ step.label }}"
agents:
Expand Down
1 change: 1 addition & 0 deletions .github/ISSUE_TEMPLATE/200-installation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
Expand Down
1 change: 1 addition & 0 deletions .github/ISSUE_TEMPLATE/300-usage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
Expand Down
3 changes: 3 additions & 0 deletions .github/ISSUE_TEMPLATE/400-bug report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
Expand Down Expand Up @@ -57,6 +58,8 @@ body:
If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
placeholder: |
A clear and concise description of what the bug is.
Expand Down
1 change: 1 addition & 0 deletions .github/ISSUE_TEMPLATE/700-performance discussion.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
Expand Down
3 changes: 3 additions & 0 deletions .github/actions/nm-benchmark/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ runs:
- id: benchmark
run: |
mkdir -p ${{ inputs.output_directory }}
# move source directories
mv vllm vllm-ignore || echo "no 'vllm' folder to move"
mv csrc csrc-ignore || echo "no 'csrc' folder to move"
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
Expand Down
6 changes: 2 additions & 4 deletions .github/actions/nm-install-test-whl/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,12 @@ runs:
pip3 install coverage
pip3 install pytest-cov
pip3 install pytest-xdist
pip3 install --index-url http://${{ inputs.pypi }}:8080/ --trusted-host ${{ inputs.pypi }} nm-magic-wand-nightly
pip3 list
pip3 install -r requirements-dev.txt
BASE=$(./.github/scripts/convert-version ${{ inputs.python }})
WHL=$(find . -type f -iname "*${BASE}*.whl")
WHL_BASENAME=$(basename ${WHL})
echo "whl=${WHL_BASENAME}" >> "$GITHUB_OUTPUT"
pip3 install ${WHL}
pip3 install -r requirements-dev.txt
pip3 install ${WHL}[sparse]
# report magic_wand version
MAGIC_WAND=$(pip3 show nm-magic-wand-nightly | grep "Version" | cut -d' ' -f2)
echo "magic_wand=${MAGIC_WAND}" >> "$GITHUB_OUTPUT"
Expand Down
27 changes: 27 additions & 0 deletions .github/actions/nm-install-whl/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Composite action: activates a pyenv virtualenv for the requested Python
# version, installs requirements-dev.txt, then finds a wheel whose file name
# matches the converted Python version and installs it with the [sparse] extra.
# The vllm/ and csrc/ source directories are renamed first.
# NOTE(review): the `venv` input is declared below, but the script builds the
# virtualenv name from `env.VENV_BASE`, not `inputs.venv` - confirm this is intended.
# NOTE(review): WHL_BASENAME is computed but not used by any visible line.
name: install whl
description: 'installs found whl based on python version into specified venv'
inputs:
python:
description: 'python version, e.g. 3.10.12'
required: true
venv:
description: 'name for python virtual environment'
required: true
runs:
using: composite
steps:
- id: install_whl
run: |
# move source directories
mv vllm vllm-ignore
mv csrc csrc-ignore
# activate and install
COMMIT=${{ github.sha }}
VENV="${{ env.VENV_BASE }}-${COMMIT:0:7}"
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
pip3 install -r requirements-dev.txt
BASE=$(./.github/scripts/convert-version ${{ inputs.python }})
WHL=$(find . -type f -iname "*${BASE}*.whl")
WHL_BASENAME=$(basename ${WHL})
pip3 install ${WHL}[sparse]
shell: bash
2 changes: 1 addition & 1 deletion .github/actions/nm-set-python/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ runs:
pyenv local ${{ inputs.python }}
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
pyenv virtualenv ${VENV} || true
pyenv virtualenv --force ${VENV}
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
VERSION=$(python --version)
echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
Expand Down
6 changes: 3 additions & 3 deletions .github/scripts/nm-run-benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@

set -e
set -u

# Exactly two positional arguments are required; with any other count, print
# the usage text and stop with exit status 1.
if [ "$#" -ne 2 ]; then
  printf '%s\n' \
    "run_benchmarks needs exactly 2 arguments: " \
    " 1. Path to a .txt file containing the list of benchmark config paths" \
    " 2. The output path to store the benchmark results"
  exit 1
fi

benchmark_config_list_file=$1
output_directory=$2

for bench_config in `cat $benchmark_config_list_file`
do
echo "Running benchmarks for config " $bench_config
Expand Down
4 changes: 2 additions & 2 deletions .github/scripts/run-tests
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ do
# need to be run with specific options
if [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"test_basic_distributed_correctness"* ]]; then
CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=facebook/opt-125m pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"distributed"* ]]; then
CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then
pytest --forked ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
else
Expand Down

1 comment on commit 13a1f5b

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bigger_is_better

Benchmark suite Current: 13a1f5b Previous: df1f1a0 Ratio
{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.2.1+cu121"} 4.026795745265708 prompts/s 3.80234884054723 prompts/s 0.94
{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.2.1+cu121"} 1546.2895661820319 tokens/s 1460.1019547701362 tokens/s 0.94

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.