vLLM: a runtime error occurs when I set tensor_parallel_size=2

jfgube3f · asked 3 months ago · in: Other

Based on the information you provided, the problem appears when the tensor_parallel_size parameter is set. When you set it to 2, a runtime error is raised. The error message is:

RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
 Warning: CUDA initialization Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 103: integrity checks failed (function operator())

This error means that no usable GPU was found when PyTorch tried to create the NCCL process group. To troubleshoot it, try the following (a quick sanity check is sketched right after this list):

  1. Make sure a supported NVIDIA driver is installed on the system (nvidia-smi should list both cards).
  2. Check that your CUDA and cuDNN versions are compatible with the PyTorch build that your vLLM installation requires; the supported combinations are listed in the PyTorch documentation.
  3. If the system has more than one GPU, make sure all of them are actually visible to the process (for example via CUDA_VISIBLE_DEVICES) when you set tensor_parallel_size. With two GPUs (indices 0 and 1) you can set tensor_parallel_size to 2.
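
As a quick sanity check before starting the vLLM engine, the following sketch (my own illustration, assuming a standard PyTorch install; it is not part of your script) verifies that the driver and the CUDA runtime can actually see both GPUs:

import os
import torch

# If CUDA_VISIBLE_DEVICES is set, it must list every GPU that vLLM should use,
# e.g. CUDA_VISIBLE_DEVICES=0,1 for tensor_parallel_size=2.
print("CUDA_VISIBLE_DEVICES =", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("is_available =", torch.cuda.is_available())    # expected: True
print("device_count =", torch.cuda.device_count())    # expected: >= 2
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))

If device_count is 0 here, the problem is in the driver/CUDA setup (or in how the process is launched) rather than in vLLM itself.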

In addition, the code snippet you posted appears to be part of a FastAPI application. To help you further, I would need more context about this application, in particular the parts related to model loading, input handling, and output generation.

        ModelCard(id=served_model,
                  root=served_model,
                  permission=[ModelPermission()])
    ]
    return ModelList(data=model_cards)


def create_logprobs(token_ids: List[int],
                    id_logprobs: List[Dict[int, float]],
                    initial_text_offset: int = 0) -> LogProbs:
    """Create OpenAI-style logprobs."""
    logprobs = LogProbs()
    last_token_len = 0
    for token_id, id_logprob in zip(token_ids, id_logprobs):
        token = tokenizer.convert_ids_to_tokens(token_id)
        logprobs.tokens.append(token)
        logprobs.token_logprobs.append(id_logprob[token_id])
        if len(logprobs.text_offset) == 0:
            logprobs.text_offset.append(initial_text_offset)
        else:
            logprobs.text_offset.append(logprobs.text_offset[-1] +
                                        last_token_len)
        last_token_len = len(token)

        logprobs.top_logprobs.append({
            tokenizer.convert_ids_to_tokens(i): p
            for i, p in id_logprob.items()
        })
    return logprobs


@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):

    global MODEL_TYPE  # model name
    global MODEL_PATH  # model path
    model_name = MODEL_PATH
    model_Type = MODEL_TYPE  # record the model name

    logger.info(f"Received chat completion request: {request}")

    error_check_ret = await check_model(request)
    if error_check_ret is not None:
        return error_check_ret

    if request.logit_bias is not None and len(request.logit_bias) > 0:
        # TODO: support logit_bias in vLLM engine.
        return create_error_response(HTTPStatus.BAD_REQUEST,
                                     "logit_bias is not currently supported")

    prompt = await get_gen_prompt(request)

    def create_stream_response_json(
            index: int,
            text: str,
            finish_reason: Optional[str] = None,
    ) -> str:
        choice_data = ChatCompletionResponseStreamChoice(
            index=index,
            delta=DeltaMessage(content=text),
            finish_reason=finish_reason,
        )
        response = ChatCompletionStreamResponse(
            # id=request_id,
            # created=created_time,
            # model=model_name,
            model=model_Type,  # return the model name
            choices=[choice_data],
        )
        response_json = response.json(ensure_ascii=False)

        return response_json

    token_ids, error_check_ret = await check_length(request, prompt=prompt)
    if error_check_ret is not None:
        return error_check_ret

    request_id = f"cmpl-{random_uuid()}"
    created_time = int(time.monotonic())
    try:
        sampling_params = SamplingParams(
            n=request.n,
            presence_penalty=request.presence_penalty,
            frequency_penalty=request.frequency_penalty,
            temperature=request.temperature,
            top_p=request.top_p,
            stop=request.stop,
            stop_token_ids=request.stop_token_ids,
            max_tokens=request.max_new_tokens,
            best_of=request.best_of,
            top_k=request.top_k,
            ignore_eos=request.ignore_eos,
            use_beam_search=request.use_beam_search,
            skip_special_tokens=request.skip_special_tokens,
        )
    except ValueError as e:
        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
    result_generator = engine.generate(prompt, sampling_params, request_id,
                                       token_ids)

    async def completion_stream_generator() -> AsyncGenerator[str, None]:
        # First chunk with role
        for i in range(request.n):
            choice_data = ChatCompletionResponseStreamChoice(
                index=i,
                delta=DeltaMessage(role="assistant"),
                finish_reason=None,
            )
            chunk = ChatCompletionStreamResponse(object="chat.completion.chunk",
                                                 id=request_id,
                                                 choices=[choice_data],
                                                 model=model_Type)
            data = chunk.json(exclude_unset=True, ensure_ascii=False)
            yield f"data: {data}\n\n"

        previous_texts = [""] * request.n
        previous_num_tokens = [0] * request.n
        async for res in result_generator:
            res: RequestOutput
            for output in res.outputs:
                i = output.index
                delta_text = output.text[len(previous_texts[i]):]
                previous_texts[i] = output.text
                previous_num_tokens[i] = len(output.token_ids)
                response_json = create_stream_response_json(
                    index=i,
                    text=delta_text.encode("UTF-8").decode("UTF-8"),
                )
                yield f"data: {response_json}\n\n"
                if output.finish_reason is not None:
                    response_json = create_stream_response_json(
                        index=i,
                        text="",
                        finish_reason=output.finish_reason,
                    )
                    yield f"data: {response_json}\n\n"
        yield "data: [DONE]\n\n"

    if request.stream or request.use_stream_chat:
        return StreamingResponse(completion_stream_generator(),
                                 media_type="text/event-stream")

    final_res: RequestOutput = None

    choices = []  # new code

    async for res in result_generator:
        if await raw_request.is_disconnected():
            # Abort the request if the client disconnects.
            await engine.abort(request_id)
            return create_error_response(HTTPStatus.BAD_REQUEST,
                                         "Client disconnected")
        # new code
        # for output in res.outputs:
        #     choice_data = ChatCompletionResponseChoice(
        #         index=output.index,
        #         message=ChatMessage(role="assistant", content=output.text),
        #         finish_reason=output.finish_reason,
        #     )
        #     choices.append(choice_data)
        final_res = res

    assert final_res is not None
    choices = []
    for output in final_res.outputs:
        choice_data = ChatCompletionResponseChoice(
            index=output.index,
            message=ChatMessage(role="assistant", content=output.text),
            finish_reason=output.finish_reason,
        )
        choices.append(choice_data)

    choice = choices[0]

    contents = [choice.message.content for choice in choices]
    content_list = [content_chuli(content) for content in contents]
    content2 = None

    # return {"response": choices}  # disabled: returning here would skip the retry logic below

    return_content = []
    for content in content_list:
        count = 0  # new code 1
        head = " ".join(manual_tokenize(content)[:20])
        # Regenerate when the reply opens with an apology ("抱歉"/"sorry") or is suspiciously short.
        if ("抱歉" in head or "sorry" in head) or (len(manual_tokenize(content)) <= len(manual_tokenize(prompt)) * 0.005):
            count += 1  # new code 2
            try:
                sampling_params = SamplingParams(
                    n=1,
                    presence_penalty=request.presence_penalty,
                    frequency_penalty=request.frequency_penalty,
                    temperature=1,
                    top_p=request.top_p,
                    stop=request.stop,
                    stop_token_ids=request.stop_token_ids,
                    max_tokens=request.max_new_tokens,
                    best_of=request.best_of,
                    top_k=50,
                    ignore_eos=request.ignore_eos,
                    use_beam_search=request.use_beam_search,
                    skip_special_tokens=request.skip_special_tokens,
                )
            except ValueError as e:
                return create_error_response(HTTPStatus.BAD_REQUEST, str(e))

            pattern = r'(<\|im_start\|>system \n).*?(<\|im_end\|>)'
            prompt = re.sub(pattern, r'\1You are a helpful assistant.\2', prompt)
            result_generator = engine.generate(prompt, sampling_params, request_id,
                                               token_ids)
            final_res: RequestOutput = None
            async for res in result_generator:
                if await raw_request.is_disconnected():
                    # Abort the request if the client disconnects.
                    await engine.abort(request_id)
                    return create_error_response(HTTPStatus.BAD_REQUEST,
                                                 "Client disconnected")
                final_res = res
            assert final_res is not None
            choices = []
            for output in final_res.outputs:
                choice_data = ChatCompletionResponseChoice(
                    index=output.index,
                    message=ChatMessage(role="assistant", content=output.text),
                    finish_reason=output.finish_reason,
                )
                choices.append(choice_data)
            choice = choices[0]
            content = choice.message.content
            content2 = content_chuli(content)
            content2 = content
        # If the second call's result is not better than the first, keep the first one.
        if content2:
            if len(content2) < len(content):
                content = content
            else:
                content = content2
        else:
            content = content  # new code
        if request.history != "":
            History = request.history
        else:
            History = []
        print(f"content: {content}")
        logger.info(f"content: {content}")
        return_content.append({"response": content, "history": History, "count": count, "length": len(content)})

    return return_content


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="vLLM OpenAI-Compatible RESTful API server.")
    parser.add_argument("--host", type=str, default=None, help="host name")
    parser.add_argument("--port", type=int, default=8098, help="port number")
    parser.add_argument("--allow-credentials",
                        action="store_true",
                        help="allow credentials")
    parser.add_argument("--allowed-origins",
                        type=json.loads,
                        default=["*"],
                        help="allowed origins")
    parser.add_argument("--allowed-methods",
                        type=json.loads,
                        default=["*"],
                        help="allowed methods")
    parser.add_argument("--allowed-headers",
                        type=json.loads,
                        default=["*"],
                        help="allowed headers")
    parser.add_argument("--served-model-name",
                        type=str,
                        default=None,
                        help="The model name used in the API. If not "
                        "specified, the model name will be the same as "
                        "the huggingface name.")
    parser.add_argument("--model_type",
                        type=str,
                        default=None,
                        help="The model name in huggingface.")
    parser.add_argument("--tensor_parallel_size", type=int, default=1,
                        help="number of gpus to use")
    parser.add_argument("--gpu_memory_utilization", type=float, default=0.90)
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    logger.info(f"args: {args}")

    global MODEL_TYPE
    global MODEL_PATH
    MODEL_TYPE = args.model_type
    MODEL_PATH = args.model

    if args.served_model_name is not None:
        served_model = args.served_model_name
    else:
        served_model = args.model

    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    engine_model_config = asyncio.run(engine.get_model_config())
    max_model_len = engine_model_config.max_model_len

    # A separate tokenizer to map token IDs to strings.
    tokenizer = get_tokenizer(engine_args.tokenizer,
                              tokenizer_mode=engine_args.tokenizer_mode,
                              trust_remote_code=engine_args.trust_remote_code)

    # Add the current time (UTC+8, Asia/Shanghai) to uvicorn's log output.
    def converter(*args):
        cn_tz = timezone(timedelta(hours=8), name='Asia/Shanghai')
        dt = datetime.now(cn_tz)
        return dt.timetuple()  # return a time tuple

    logging.Formatter.converter = converter
    log_config = uvicorn.config.LOGGING_CONFIG
    log_config["formatters"]["access"]["fmt"] = ' %(levelprefix)s %(asctime)s - %(client_addr)s - "%(request_line)s" - %(status_code)s '
    log_config["formatters"]["default"]["fmt"] = " %(levelprefix)s %(asctime)s %(message)s "
    uvicorn.run(app,
                host=args.host,
                port=args.port,
                log_level="info",
                timeout_keep_alive=TIMEOUT_KEEP_ALIVE)

vulvrdjw 1#

When tensor_parallel_size=2 is used, the output is:


![](//img.saoniuhuo.com/images/202408/5801723537227063.jpg)
wlp8pajw 2#

I ran into this problem before on GCP with an older version of vLLM. It turned out that Ray was failing to detect the GPUs on GCP; that bug was fixed a few weeks ago. Could you try the latest vLLM release? You are currently on vLLM 0.2.1, which is quite old by now.
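
If you upgrade (for example with pip install --upgrade vllm), a small offline smoke test, independent of the FastAPI wrapper above, can confirm that tensor parallelism works on the machine. This is only a sketch: "facebook/opt-125m" is a placeholder model, substitute the local model path you actually serve.

import vllm
from vllm import LLM, SamplingParams

print(vllm.__version__)  # make sure you are no longer on 0.2.1

# Placeholder model: replace "facebook/opt-125m" with your own model path.
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=2)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)

If this two-GPU test passes but the server still fails, the issue is more likely in how the server process sees the GPUs (environment variables, container runtime) than in vLLM itself.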
