Current environment
"image": "vllm/vllm-openai:latest"
"options": "runtime=nvidia gpus=1 ipc=host"
"--port=8000
--model=meta-llama/Meta-Llama-3-8B-Instruct
--tensor-parallel-size=1
--disable-log-requests
--enable-prefix-caching
--gpu-memory-utilization=1",
vllm 0.5.0.post1
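For reference, a roughly equivalent standalone launch under these settings might look like the sketch below; the port mapping and the Hugging Face token variable are assumptions of mine, not taken from the report above.

```bash
# Rough equivalent of the reported container settings (sketch only);
# the -p mapping and HF token are assumptions, not from the report.
docker run --runtime=nvidia --gpus 1 --ipc=host \
  -p 8000:8000 \
  -e HUGGING_FACE_HUB_TOKEN=<your_hf_token> \
  vllm/vllm-openai:latest \
  --port 8000 \
  --model meta-llama/Meta-Llama-3-8B-Instruct \
  --tensor-parallel-size 1 \
  --disable-log-requests \
  --enable-prefix-caching \
  --gpu-memory-utilization 1
```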
🐛 Describe the bug
Running the following query:
'{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "prompt": ["The following are multiple choice questions (with answers) about abstract algebra.\\nFind all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.\\nA. 0\\nB. 1\\nC. 2\\nD. 3\\nAnswer: B\\n\\nStatement 1 | If aH is an element of a factor group, then |aH| divides |a|. Statement 2 | If H and K are subgroups of G then HK is a subgroup of G.\\nA. True, True\\nB. False, False\\nC. True, False\\nD. False, True\\nAnswer: B\\n\\nStatement 1 | Every element of a group generates a cyclic subgroup of the group. Statement 2 | The symmetric group S_10 has 10 elements.\\nA. True, True\\nB. False, False\\nC. True, False\\nD. False, True\\nAnswer: C\\n\\nStatement 1| Every function from a finite set onto itself must be one to one. Statement 2 | Every subgroup of an abelian group is abelian.\\nA. True, True\\nB. False, False\\nC. True, False\\nD. False, True\\nAnswer: A\\n\\nFind the characteristic of the ring 2Z.\\nA. 0\\nB. 3\\nC. 12\\nD. 30\\nAnswer: A\\n\\nFind the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.\\nA. 0\\nB. 4\\nC. 2\\nD. 6\\nAnswer: A"], "max_tokens": 1, "temperature": 0, "logprobs": 1, "echo": true}'
This crashes vLLM, and the log contains the following:
ERROR 06-26 06:20:13 async_llm_engine.py:52] Engine background task failed
ERROR 06-26 06:20:13 async_llm_engine.py:52] Traceback (most recent call last):
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 42, in _log_task_completion
ERROR 06-26 06:20:13 async_llm_engine.py:52] return_value = task.result()
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 532, in run_engine_loop
ERROR 06-26 06:20:13 async_llm_engine.py:52] has_requests_in_progress = await asyncio.wait_for(
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/lib/python3.10/asyncio/tasks.py", line 445, in wait_for
ERROR 06-26 06:20:13 async_llm_engine.py:52] return fut.result()
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 506, in engine_step
ERROR 06-26 06:20:13 async_llm_engine.py:52] request_outputs = await self.engine.step_async()
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 235, in step_async
ERROR 06-26 06:20:13 async_llm_engine.py:52] output = await self.model_executor.execute_model_async(
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/vllm/executor/gpu_executor.py", line 117, in execute_model_async
ERROR 06-26 06:20:13 async_llm_engine.py:52] output = await make_async(self.driver_worker.execute_model
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
ERROR 06-26 06:20:13 async_llm_engine.py:52] result = self.fn(*self.args, **self.kwargs)
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
ERROR 06-26 06:20:13 async_llm_engine.py:52] return func(*args, **kwargs)
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 280, in execute_model
ERROR 06-26 06:20:13 async_llm_engine.py:52] output = self.model_runner.execute_model(seq_group_metadata_list,
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
ERROR 06-26 06:20:13 async_llm_engine.py:52] return func(*args, **kwargs)
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 765, in execute_model
ERROR 06-26 06:20:13 async_llm_engine.py:52] output = self.model.sample(
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/llama.py", line 386, in sample
ERROR 06-26 06:20:13 async_llm_engine.py:52] next_tokens = self.sampler(logits, sampling_metadata)
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
ERROR 06-26 06:20:13 async_llm_engine.py:52] return self._call_impl(*args, **kwargs)
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
ERROR 06-26 06:20:13 async_llm_engine.py:52] return forward_call(*args, **kwargs)
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/sampler.py", line 112, in forward
ERROR 06-26 06:20:13 async_llm_engine.py:52] prompt_logprobs, sample_logprobs = _get_logprobs(
ERROR 06-26 06:20:13 async_llm_engine.py:52] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/sampler.py", line 760, in _get_logprobs
ERROR 06-26 06:20:13 async_llm_engine.py:52] assert len(next_token_ids) == len(query_indices)
ERROR 06-26 06:20:13 async_llm_engine.py:52] AssertionError
Exception in callback functools.partial(<function _log_task_completion at 0x7ff5f4d429e0>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7ff5dcdcf160>>)
handle: <Handle functools.partial(<function _log_task_completion at 0x7ff5f4d429e0>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7ff5dcdcf160>>)>
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 42, in _log_task_completion
return_value = task.result()
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 532, in run_engine_loop
has_requests_in_progress = await asyncio.wait_for(
File "/usr/lib/python3.10/asyncio/tasks.py", line 445, in wait_for
return fut.result()
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 506, in engine_step
request_outputs = await self.engine.step_async()
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 235, in step_async
output = await self.model_executor.execute_model_async(
File "/usr/local/lib/python3.10/dist-packages/vllm/executor/gpu_executor.py", line 117, in execute_model_async
output = await make_async(self.driver_worker.execute_model
File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 280, in execute_model
output = self.model_runner.execute_model(seq_group_metadata_list,
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 765, in execute_model
output = self.model.sample(
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/llama.py", line 386, in sample
next_tokens = self.sampler(logits, sampling_metadata)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/sampler.py", line 112, in forward
prompt_logprobs, sample_logprobs = _get_logprobs(
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/sampler.py", line 760, in _get_logprobs
assert len(next_token_ids) == len(query_indices)
AssertionError
1 answer
Note that this may be a duplicate of #5344.
For now, you can avoid this error by removing --enable-prefix-caching from the server arguments (although an OOM can still occur with a sufficiently long context; there appear to be ongoing efforts to improve this, e.g. #5907).
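Concretely, that means relaunching the server with the same arguments minus that flag. A sketch based on the environment reported above (an assumption-laden example, not the reporter's exact command):

```bash
# Workaround sketch: identical launch, but without --enable-prefix-caching.
docker run --runtime=nvidia --gpus 1 --ipc=host -p 8000:8000 \
  vllm/vllm-openai:latest \
  --port 8000 \
  --model meta-llama/Meta-Llama-3-8B-Instruct \
  --tensor-parallel-size 1 \
  --disable-log-requests \
  --gpu-memory-utilization 1
```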