Current environment
Versions of relevant libraries:
[pip3] clip-anytorch==2.5.2
[pip3] mypy-extensions==1.0.0
[pip3] numpy==1.23.5
[pip3] onnx==1.14.0
[pip3] onnxruntime==1.16.3
[pip3] onnxruntime-gpu==1.16.3
[pip3] open-clip-torch==2.20.0
[pip3] pytorch-lightning==1.9.4
[pip3] torch==2.1.2
[pip3] torchdiffeq==0.2.3
[pip3] torchmetrics==0.11.4
[pip3] torchsde==0.2.6
[pip3] torchvision==0.15.1+cu118
[pip3] triton==2.1.0
[conda] clip-anytorch 2.5.2 pypi_0 pypi
[conda] numpy 1.23.5 pypi_0 pypi
[conda] open-clip-torch 2.20.0 pypi_0 pypi
[conda] pytorch-lightning 1.9.4 pypi_0 pypi
[conda] torch 2.1.2 pypi_0 pypi
[conda] torchdiffeq 0.2.3 pypi_0 pypi
[conda] torchmetrics 0.11.4 pypi_0 pypi
[conda] torchsde 0.2.6 pypi_0 pypi
[conda] torchvision 0.15.1+cu118 pypi_0 pypi
[conda] triton 2.1.0 pypi_0 pypi
ROCM Version: Could not collect
Neuron SDK Version: N/A
vLLM Version: 0.4.0.post1
🐛 Describe the bug
Code
from vllm import LLM, SamplingParams

prompts = [
    "hello, who is you?",
]

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Load the GPTQ-quantized Yarn-Mistral 128k model
llm = LLM(model="TheBloke/Yarn-Mistral-7B-128k-GPTQ")

# Generate and print the completion for each prompt
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Error
INFO 04-18 15:38:45 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
INFO 04-18 15:38:45 selector.py:25] Using XFormers backend.
INFO 04-18 15:38:47 model_runner.py:104] Loading model weights took 3.9080 GB
INFO 04-18 15:38:55 gpu_executor.py:94] # GPU blocks: 1761, # CPU blocks: 2048
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[1], line 11
6 prompts = [
7 "hello, who is you?",
8 ]
10 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
---> 11 llm = LLM(model="TheBloke/Yarn-Mistral-7B-128k-GPTQ", max_model_len=131072, gpu_memory_utilization=1 )
12 outputs = llm.generate(prompts, sampling_params)
14 for output in outputs:
File ~/miniconda3/lib/python3.10/site-packages/vllm/entrypoints/llm.py:112, in LLM.__init__(self, model, tokenizer, tokenizer_mode, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, enforce_eager, max_context_len_to_capture, disable_custom_all_reduce, **kwargs)
93 kwargs["disable_log_stats"] = True
94 engine_args = EngineArgs(
95 model=model,
96 tokenizer=tokenizer,
(...)
110 **kwargs,
111 )
--> 112 self.llm_engine = LLMEngine.from_engine_args(
113 engine_args, usage_context=UsageContext.LLM_CLASS)
114 self.request_counter = Counter()
File ~/miniconda3/lib/python3.10/site-packages/vllm/engine/llm_engine.py:196, in LLMEngine.from_engine_args(cls, engine_args, usage_context)
193 executor_class = GPUExecutor
195 # Create the LLM engine.
--> 196 engine = cls(
197 *engine_configs,
198 executor_class=executor_class,
199 log_stats=not engine_args.disable_log_stats,
200 usage_context=usage_context,
201 )
202 return engine
File ~/miniconda3/lib/python3.10/site-packages/vllm/engine/llm_engine.py:110, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, lora_config, vision_language_config, executor_class, log_stats, usage_context)
107 self.detokenizer = Detokenizer(self.tokenizer)
108 self.seq_counter = Counter()
--> 110 self.model_executor = executor_class(model_config, cache_config,
111 parallel_config, scheduler_config,
112 device_config, lora_config,
113 vision_language_config)
115 # If usage stat is enabled, collect relevant info.
116 if is_usage_stats_enabled():
File ~/miniconda3/lib/python3.10/site-packages/vllm/executor/gpu_executor.py:40, in GPUExecutor.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, lora_config, vision_language_config)
37 self._init_worker()
39 # Profile the memory usage and initialize the cache.
---> 40 self._init_cache()
File ~/miniconda3/lib/python3.10/site-packages/vllm/executor/gpu_executor.py:97, in GPUExecutor._init_cache(self)
92 num_gpu_blocks = forced_num_gpu_blocks
94 logger.info(f"# GPU blocks: {num_gpu_blocks}, "
95 f"# CPU blocks: {num_cpu_blocks}")
---> 97 check_block_size_valid(num_gpu_blocks, self.cache_config.block_size,
98 self.model_config.max_model_len)
100 self.cache_config.num_gpu_blocks = num_gpu_blocks
101 self.cache_config.num_cpu_blocks = num_cpu_blocks
File ~/miniconda3/lib/python3.10/site-packages/vllm/executor/utils.py:8, in check_block_size_valid(num_gpu_blocks, block_size, max_model_len)
6 max_seq_len = block_size * num_gpu_blocks
7 if max_model_len > max_seq_len:
----> 8 raise ValueError(
9 f"The model's max seq len ({max_model_len}) "
10 "is larger than the maximum number of tokens that can be "
11 f"stored in KV cache ({max_seq_len}). Try increasing "
12 "`gpu_memory_utilization` or decreasing `max_model_len` when "
13 "initializing the engine.")
ValueError: The model's max seq len (131072) is larger than the maximum number of tokens that can be stored in KV cache (28176). Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine.
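The reported capacity matches the startup log. As a rough sanity check (assuming vLLM's default block_size of 16 tokens per KV-cache block, which is configurable via EngineArgs), the 1761 GPU blocks shown above hold 1761 × 16 = 28,176 tokens, far short of the requested 131,072-token context:

num_gpu_blocks = 1761           # from the "# GPU blocks: 1761" log line above
block_size = 16                 # vLLM's default; an assumption, not read from this run
max_seq_len = num_gpu_blocks * block_size
print(max_seq_len)              # 28176, exactly the capacity quoted in the ValueError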
1 answer
vwoqyblh 1#
Hi, I don't think a 4090 can fully support a 128k context length...
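For reference, a minimal sketch of the workaround suggested by the ValueError itself, assuming the same model and simply capping the context window to what the reported KV cache can hold (the 28,000 figure is illustrative, not tuned):

from vllm import LLM, SamplingParams

# Cap max_model_len so the full context fits in the 28176-token KV cache
# reported above; 28000 is an illustrative value, not a tuned one.
llm = LLM(
    model="TheBloke/Yarn-Mistral-7B-128k-GPTQ",
    max_model_len=28000,
)

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(["hello, who is you?"], sampling_params)
print(outputs[0].outputs[0].text)

Going by the numbers in the error, serving the full 128k window would need a KV cache roughly 4.7x larger (131072 / 28176) than what fit on this card at gpu_memory_utilization=1.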