Skip to content

Commit

Permalink
Merge branch 'main' into test-optional-libraries
Browse files Browse the repository at this point in the history
  • Loading branch information
mgoin committed May 14, 2024
2 parents c849276 + 3a25456 commit 8812e42
Show file tree
Hide file tree
Showing 296 changed files with 16,887 additions and 4,598 deletions.
36 changes: 36 additions & 0 deletions .buildkite/check-wheel-size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
import zipfile

MAX_SIZE_MB = 100


def print_top_10_largest_files(zip_file):
with zipfile.ZipFile(zip_file, 'r') as z:
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
file_sizes.sort(key=lambda x: x[1], reverse=True)
for f, size in file_sizes[:10]:
print(f"{f}: {size/(1024*1024)} MBs uncompressed.")


def check_wheel_size(directory):
for root, _, files in os.walk(directory):
for f in files:
if f.endswith(".whl"):
wheel_path = os.path.join(root, f)
wheel_size = os.path.getsize(wheel_path)
wheel_size_mb = wheel_size / (1024 * 1024)
if wheel_size_mb > MAX_SIZE_MB:
print(
f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
f"compare to the allowed size ({MAX_SIZE_MB} MB).")
print_top_10_largest_files(wheel_path)
return 1
else:
print(f"Wheel {wheel_path} is within the allowed size "
f"({wheel_size_mb} MB).")
return 0


if __name__ == "__main__":
import sys
sys.exit(check_wheel_size(sys.argv[1]))
58 changes: 25 additions & 33 deletions .buildkite/run-amd-test.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# This script build the ROCm docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
# This script build the ROCm docker image and runs test inside it.
set -ex

# Print ROCm version
echo "--- ROCm info"
rocminfo

echo "--- Resetting GPUs"

echo "reset" > /opt/amdgpu/etc/gpu_state

Expand All @@ -16,37 +17,28 @@ while true; do
fi
done

echo "--- Building container"
sha=$(git rev-parse --short HEAD)
container_name=rocm_${sha}
docker build \
-t ${container_name} \
-f Dockerfile.rocm \
--progress plain \
.

remove_docker_container() {
docker rm -f ${container_name} || docker image rm -f ${container_name} || true
}
trap remove_docker_container EXIT

echo "--- Running container"

# Try building the docker image
docker build -t rocm -f Dockerfile.rocm .

# Setup cleanup
remove_docker_container() { docker rm -f rocm || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
export HIP_VISIBLE_DEVICES=1
docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &

# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0

while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--rm \
-e HF_TOKEN \
--name ${container_name} \
${container_name} \
/bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//")

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'
5 changes: 5 additions & 0 deletions .buildkite/run-benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md

# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
exit 0
fi

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

Expand Down
14 changes: 14 additions & 0 deletions .buildkite/run-neuron-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,20 @@ set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
docker system prune -f
echo $current_time > /tmp/neuron-docker-build-timestamp
fi
else
echo $(date +%s) > /tmp/neuron-docker-build-timestamp
fi

docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
Expand Down
23 changes: 21 additions & 2 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,38 @@ steps:
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test
mirror_hardwares: [amd]
command: pytest -v -s core

- label: Distributed Comm Ops Test
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.
num_gpus: 2

- label: Distributed Tests
working_dir: "/vllm-workspace/tests/distributed"

num_gpus: 2 # only support 1 or 2 for now.
mirror_hardwares: [amd]

commands:
- pytest -v -s test_pynccl.py
- pytest -v -s test_pynccl_library.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py

- label: Distributed Tests (Multiple Groups)
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 4
commands:
- pytest -v -s test_pynccl.py

- label: Engine Test
mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py

- label: Entrypoints Test
Expand All @@ -48,6 +59,7 @@ steps:

- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
- pip install awscli
Expand All @@ -61,29 +73,35 @@ steps:
parallelism: 4

- label: Models Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py

- label: Llava Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py

- label: Prefix Caching Test
mirror_hardwares: [amd]
commands:
- pytest -v -s prefix_caching

- label: Samplers Test
command: pytest -v -s samplers

- label: LogitsProcessor Test
mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py

- label: Worker Test
mirror_hardwares: [amd]
command: pytest -v -s worker

- label: Speculative decoding tests
mirror_hardwares: [amd]
command: pytest -v -s spec_decode

- label: LoRA Test %N
Expand All @@ -101,6 +119,7 @@ steps:

- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
commands:
- pip install aiohttp
- bash run-benchmarks.sh
Expand Down
25 changes: 20 additions & 5 deletions .buildkite/test-template.j2
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,29 @@ steps:
limit: 5
- wait

- label: "AMD Test"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh
- group: "AMD Tests"
depends_on: ~
steps:
{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
env:
DOCKER_BUILDKIT: "1"
{% endif %}
{% endfor %}

- label: "Neuron Test"
depends_on: ~
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: true

- label: "CPU Test"
- label: "Intel Test"
depends_on: ~
command: bash .buildkite/run-cpu-test.sh

{% for step in steps %}
Expand All @@ -44,6 +56,9 @@ steps:
plugins:
- kubernetes:
podSpec:
{% if step.num_gpus %}
priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
{% endif %}
volumes:
- name: dshm
emptyDir:
Expand Down
49 changes: 49 additions & 0 deletions .github/ISSUE_TEMPLATE/750-RFC.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
name: 💬 Request for comments (RFC).
description: Ask for feedback on major architectural changes or design choices.
title: "[RFC]: "
labels: ["RFC"]

body:
- type: markdown
attributes:
value: >
#### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
- type: textarea
attributes:
label: Motivation.
description: >
The motivation of the RFC.
validations:
required: true
- type: textarea
attributes:
label: Proposed Change.
description: >
The proposed change of the RFC.
validations:
required: true
- type: textarea
attributes:
label: Feedback Period.
description: >
The feedback period of the RFC. Usually at least one week.
validations:
required: false
- type: textarea
attributes:
label: CC List.
description: >
The list of people you want to CC.
validations:
required: false
- type: textarea
attributes:
label: Any Other Things.
description: >
Any other things you would like to mention.
validations:
required: false
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!
6 changes: 5 additions & 1 deletion .github/actions/nm-lm-eval-accuracy/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ runs:
steps:
- id: lm-eval
run: |
# move source directories
mv vllm vllm-ignore || echo "no 'vllm' folder to move"
mv csrc csrc-ignore || echo "no 'csrc' folder to move"
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
Expand All @@ -20,7 +24,7 @@ runs:
pip3 install pytest openai==1.3.9
SUCCESS=0
pytest .github/scripts/test_lm_eval_sweep.py -s -v || SUCCESS=$?
pytest -v tests/accuracy/test_lm_eval_correctness.py || SUCCESS=$?
echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT"
exit ${SUCCESS}
shell: bash
4 changes: 4 additions & 0 deletions .github/actions/nm-lm-eval-smoke/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ runs:
steps:
- id: lm-eval
run: |
# move source directories
mv vllm vllm-ignore || echo "no 'vllm' folder to move"
mv csrc csrc-ignore || echo "no 'csrc' folder to move"
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
Expand Down
5 changes: 5 additions & 0 deletions .github/data/nm_benchmark_weekly_configs_list.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
neuralmagic/benchmarks/configs/benchmark_serving.json
neuralmagic/benchmarks/configs/benchmark_throughput.json
neuralmagic/benchmarks/configs/benchmark_throughput_decode.json
neuralmagic/benchmarks/configs/benchmark_throughput_prefill.json
neuralmagic/benchmarks/configs/benchmark_remote_push.json
4 changes: 2 additions & 2 deletions .github/scripts/lm_eval_compare_hf_vs_vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def print_results(data_to_print: List = None,
def check_passing_score(results_dict: Dict = None,
alpha: float = None) -> bool:
for task in results_dict:
p_value = task["p_value"]
p_value = results_dict[task]["p_value"]
if p_value <= alpha:
return False
return True
Expand Down Expand Up @@ -120,6 +120,6 @@ def parse_args():
all_res[task1[0]] = {"z": z, "p_value": p_value}
print_results([results_hf["results"], results_vllm["results"]], all_res,
args.alpha)
if not check_passing_score:
if not check_passing_score(all_res, args.alpha):
print("Accuracy test failed!")
exit(1)
2 changes: 2 additions & 0 deletions .github/scripts/run-tests
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ do
CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then
pytest --forked ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"basic_correctness/test_preemption"* ]]; then
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
else
pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
fi
Expand Down

1 comment on commit 8812e42

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bigger_is_better

Benchmark suite Current: 8812e42 Previous: d485d3e Ratio
{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]", "torch_version": "2.3.0+cu121"} 3.8388254207613244 prompts/s 3.8408865302828166 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]", "torch_version": "2.3.0+cu121"} 1474.1089615723486 tokens/s 1474.9004276286016 tokens/s 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.