You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I used SqueezeLLM 4-bit to quantize my model, but it seems that there is a bug.
I just followed the SqueezeLLM steps:
python chunk_models.py --model [MODEL_PATH] --output [MODEL_CHUNKS_PATH] --model_type llama
python chunk_models.py --model [GRADIENT_PATH] --output [GRADIENT_CHUNKS_PATH] --model_type llama
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/ryan/vllm/benchmarks/benchmark_latency.py", line 195, in
[rank0]: main(args)
[rank0]: File "/home/ryan/vllm/benchmarks/benchmark_latency.py", line 20, in main
[rank0]: llm = LLM(model=args.model,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/entrypoints/llm.py", line 123, in init
[rank0]: self.llm_engine = LLMEngine.from_engine_args(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/engine/llm_engine.py", line 292, in from_engine_args
[rank0]: engine = cls(
[rank0]: ^^^^
[rank0]: File "/home/ryan/vllm/vllm/engine/llm_engine.py", line 160, in init
[rank0]: self.model_executor = executor_class(
[rank0]: ^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/executor/executor_base.py", line 41, in init
[rank0]: self._init_executor()
[rank0]: File "/home/ryan/vllm/vllm/executor/gpu_executor.py", line 23, in _init_executor
[rank0]: self._init_non_spec_worker()
[rank0]: File "/home/ryan/vllm/vllm/executor/gpu_executor.py", line 69, in _init_non_spec_worker
[rank0]: self.driver_worker.load_model()
[rank0]: File "/home/ryan/vllm/vllm/worker/worker.py", line 118, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/home/ryan/vllm/vllm/worker/model_runner.py", line 164, in load_model
[rank0]: self.model = get_model(
[rank0]: ^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/model_executor/model_loader/init.py", line 19, in get_model
[rank0]: return loader.load_model(model_config=model_config,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/model_executor/model_loader/loader.py", line 224, in load_model
[rank0]: model.load_weights(
[rank0]: File "/home/ryan/vllm/vllm/model_executor/models/llama.py", line 407, in load_weights
[rank0]: param = params_dict[name]
[rank0]: ~~~~~~~~~~~^^^^^^
[rank0]: KeyError: 'model.layers.0.self_attn.qkv_proj.rows'
After I changed the code in llama.py to:
if name != "model.layers.0.self_attn.qkv_proj.rows":
param = params_dict[name]
else:
param = params_dict["model.layers.0.self_attn.qkv_proj.qweight"]
I still get a bug:
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/ryan/vllm/benchmarks/benchmark_latency.py", line 195, in
[rank0]: main(args)
[rank0]: File "/home/ryan/vllm/benchmarks/benchmark_latency.py", line 20, in main
[rank0]: llm = LLM(model=args.model,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/entrypoints/llm.py", line 123, in init
[rank0]: self.llm_engine = LLMEngine.from_engine_args(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/engine/llm_engine.py", line 292, in from_engine_args
[rank0]: engine = cls(
[rank0]: ^^^^
[rank0]: File "/home/ryan/vllm/vllm/engine/llm_engine.py", line 160, in init
[rank0]: self.model_executor = executor_class(
[rank0]: ^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/executor/executor_base.py", line 41, in init
[rank0]: self._init_executor()
[rank0]: File "/home/ryan/vllm/vllm/executor/gpu_executor.py", line 23, in _init_executor
[rank0]: self._init_non_spec_worker()
[rank0]: File "/home/ryan/vllm/vllm/executor/gpu_executor.py", line 69, in _init_non_spec_worker
[rank0]: self.driver_worker.load_model()
[rank0]: File "/home/ryan/vllm/vllm/worker/worker.py", line 118, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/home/ryan/vllm/vllm/worker/model_runner.py", line 164, in load_model
[rank0]: self.model = get_model(
[rank0]: ^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/model_executor/model_loader/init.py", line 19, in get_model
[rank0]: return loader.load_model(model_config=model_config,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/model_executor/model_loader/loader.py", line 224, in load_model
[rank0]: model.load_weights(
[rank0]: File "/home/ryan/vllm/vllm/model_executor/models/llama.py", line 412, in load_weights
[rank0]: weight_loader(param, loaded_weight, shard_id)
[rank0]: File "/home/ryan/vllm/vllm/model_executor/layers/linear.py", line 561, in weight_loader
[rank0]: loaded_weight = loaded_weight.narrow(output_dim, start_idx,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
The text was updated successfully, but these errors were encountered:
Hi @RyanWMHI — SqueezeLLM isn't a commonly used quantization backend in vLLM; we generally recommend GPTQ or AWQ now. My assumption is that we won't support any advanced configurations, such as sparse storage.
Your current environment
python 3.11, vllm 0.4.2
🐛 Describe the bug
I used SqueezeLLM 4-bit to quantize my model, but it seems that there is a bug.
I just followed the SqueezeLLM steps:
python chunk_models.py --model [MODEL_PATH] --output [MODEL_CHUNKS_PATH] --model_type llama
python chunk_models.py --model [GRADIENT_PATH] --output [GRADIENT_CHUNKS_PATH] --model_type llama
python generate_outlier_config.py --model [MODEL_CHUNKS_PATH] --range [RANGE] --output [OUTLIERS_CONFIG_PATH]
python nuq.py --bit 4 --model_type llama --model [MODEL_CHUNKS_PATH] --gradient [GRADIENT_CHUNKS_PATH] --output [LUT_PATH]
python pack.py --model [MODEL_PATH] --wbits 4 --folder [LUT_PATH] --save [PACKED_CKPT_PATH] --include_sparse --balance
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/ryan/vllm/benchmarks/benchmark_latency.py", line 195, in
[rank0]: main(args)
[rank0]: File "/home/ryan/vllm/benchmarks/benchmark_latency.py", line 20, in main
[rank0]: llm = LLM(model=args.model,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/entrypoints/llm.py", line 123, in init
[rank0]: self.llm_engine = LLMEngine.from_engine_args(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/engine/llm_engine.py", line 292, in from_engine_args
[rank0]: engine = cls(
[rank0]: ^^^^
[rank0]: File "/home/ryan/vllm/vllm/engine/llm_engine.py", line 160, in init
[rank0]: self.model_executor = executor_class(
[rank0]: ^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/executor/executor_base.py", line 41, in init
[rank0]: self._init_executor()
[rank0]: File "/home/ryan/vllm/vllm/executor/gpu_executor.py", line 23, in _init_executor
[rank0]: self._init_non_spec_worker()
[rank0]: File "/home/ryan/vllm/vllm/executor/gpu_executor.py", line 69, in _init_non_spec_worker
[rank0]: self.driver_worker.load_model()
[rank0]: File "/home/ryan/vllm/vllm/worker/worker.py", line 118, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/home/ryan/vllm/vllm/worker/model_runner.py", line 164, in load_model
[rank0]: self.model = get_model(
[rank0]: ^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/model_executor/model_loader/init.py", line 19, in get_model
[rank0]: return loader.load_model(model_config=model_config,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/model_executor/model_loader/loader.py", line 224, in load_model
[rank0]: model.load_weights(
[rank0]: File "/home/ryan/vllm/vllm/model_executor/models/llama.py", line 407, in load_weights
[rank0]: param = params_dict[name]
[rank0]: ~~~~~~~~~~~^^^^^^
[rank0]: KeyError: 'model.layers.0.self_attn.qkv_proj.rows'
After I changed the code in llama.py to:
if name != "model.layers.0.self_attn.qkv_proj.rows":
param = params_dict[name]
else:
param = params_dict["model.layers.0.self_attn.qkv_proj.qweight"]
I still get a bug:
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/ryan/vllm/benchmarks/benchmark_latency.py", line 195, in
[rank0]: main(args)
[rank0]: File "/home/ryan/vllm/benchmarks/benchmark_latency.py", line 20, in main
[rank0]: llm = LLM(model=args.model,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/entrypoints/llm.py", line 123, in init
[rank0]: self.llm_engine = LLMEngine.from_engine_args(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/engine/llm_engine.py", line 292, in from_engine_args
[rank0]: engine = cls(
[rank0]: ^^^^
[rank0]: File "/home/ryan/vllm/vllm/engine/llm_engine.py", line 160, in init
[rank0]: self.model_executor = executor_class(
[rank0]: ^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/executor/executor_base.py", line 41, in init
[rank0]: self._init_executor()
[rank0]: File "/home/ryan/vllm/vllm/executor/gpu_executor.py", line 23, in _init_executor
[rank0]: self._init_non_spec_worker()
[rank0]: File "/home/ryan/vllm/vllm/executor/gpu_executor.py", line 69, in _init_non_spec_worker
[rank0]: self.driver_worker.load_model()
[rank0]: File "/home/ryan/vllm/vllm/worker/worker.py", line 118, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/home/ryan/vllm/vllm/worker/model_runner.py", line 164, in load_model
[rank0]: self.model = get_model(
[rank0]: ^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/model_executor/model_loader/init.py", line 19, in get_model
[rank0]: return loader.load_model(model_config=model_config,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ryan/vllm/vllm/model_executor/model_loader/loader.py", line 224, in load_model
[rank0]: model.load_weights(
[rank0]: File "/home/ryan/vllm/vllm/model_executor/models/llama.py", line 412, in load_weights
[rank0]: weight_loader(param, loaded_weight, shard_id)
[rank0]: File "/home/ryan/vllm/vllm/model_executor/layers/linear.py", line 561, in weight_loader
[rank0]: loaded_weight = loaded_weight.narrow(output_dim, start_idx,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
The text was updated successfully, but these errors were encountered: