Based on the information you provided, the problem occurs when setting the tensor_parallel_size parameter. When you set it to 2, a runtime error is raised. The error message is as follows:
RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
Warning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 103: integrity checks failed (function operator())
This error indicates that your environment cannot use GPUs for NCCL process-group operations. To resolve it, you can try the following:
- Make sure a supported NVIDIA display driver is installed on your system.
- Check that your CUDA and cuDNN versions are compatible with your PyTorch build (vLLM runs on PyTorch, not TensorFlow); the supported combinations are listed in the PyTorch documentation.
- If your system has multiple GPUs, make sure tensor_parallel_size does not exceed the number of GPUs visible to the process. For example, with two GPUs (indices 0 and 1), setting tensor_parallel_size to 2 is valid; the quick check below confirms whether PyTorch can actually see them.
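As a quick sanity check, a minimal sketch like the following (assuming a standard PyTorch installation, since vLLM builds on it) can confirm whether PyTorch and NCCL actually see your GPUs before you launch vLLM:

# Minimal GPU visibility check before starting vLLM with tensor_parallel_size > 1.
import os

import torch
import torch.distributed as dist

print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.cuda.is_available():", torch.cuda.is_available())
print("torch.cuda.device_count():", torch.cuda.device_count())
print("NCCL available:", dist.is_nccl_available())
# tensor_parallel_size must not exceed torch.cuda.device_count().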
Also, the code snippet you mentioned appears to be part of a FastAPI application. To help you troubleshoot further, I would need more context about the application, especially the parts related to model loading, input processing, and output generation.
ModelCard(id=served_model,
root=served_model,
permission=[ModelPermission()])
]
return ModelList(data=model_cards)
def create_logprobs(token_ids: List[int],
id_logprobs: List[Dict[int, float]],
initial_text_offset: int = 0) -> LogProbs:
"""Create OpenAI-style logprobs."""
logprobs = LogProbs()
last_token_len = 0
for token_id, id_logprob in zip(token_ids, id_logprobs):
token = tokenizer.convert_ids_to_tokens(token_id)
logprobs.tokens.append(token)
logprobs.token_logprobs.append(id_logprob[token_id])
if len(logprobs.text_offset) == 0:
logprobs.text_offset.append(initial_text_offset)
else:
logprobs.text_offset.append(logprobs.text_offset[-1] +
last_token_len)
last_token_len = len(token)
logprobs.top_logprobs.append({
tokenizer.convert_ids_to_tokens(i): p
for i, p in id_logprob.items()
})
return logprobs
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
raw_request: Request):
    global MODEL_TYPE  # model name
    global MODEL_PATH  # model path
    model_name = MODEL_PATH
    model_Type = MODEL_TYPE  # record the model name
logger.info(f"Received chat completion request: {request}")
error_check_ret = await check_model(request)
if error_check_ret is not None:
return error_check_ret
if request.logit_bias is not None and len(request.logit_bias) > 0:
# TODO: support logit_bias in vLLM engine.
return create_error_response(HTTPStatus.BAD_REQUEST,
"logit_bias is not currently supported")
prompt = await get_gen_prompt(request)
def create_stream_response_json(
index: int,
text: str,
finish_reason: Optional[str] = None,
) -> str:
choice_data = ChatCompletionResponseStreamChoice(
index=index,
delta=DeltaMessage(content=text),
finish_reason=finish_reason,
)
response = ChatCompletionStreamResponse(
# id=request_id,
# created=created_time,
# model=model_name,
            model=model_Type,  # return the model name
choices=[choice_data],
)
response_json = response.json(ensure_ascii=False)
return response_json
token_ids, error_check_ret = await check_length(request, prompt=prompt)
if error_check_ret is not None:
return error_check_ret
request_id = f"cmpl-{random_uuid()}"
created_time = int(time.monotonic())
try:
sampling_params = SamplingParams(
n=request.n,
presence_penalty=request.presence_penalty,
frequency_penalty=request.frequency_penalty,
temperature=request.temperature,
top_p=request.top_p,
stop=request.stop,
stop_token_ids=request.stop_token_ids,
max_tokens=request.max_new_tokens,
best_of=request.best_of,
top_k=request.top_k,
ignore_eos=request.ignore_eos,
use_beam_search=request.use_beam_search,
skip_special_tokens=request.skip_special_tokens,
)
except ValueError as e:
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
result_generator = engine.generate(prompt, sampling_params, request_id,
token_ids)
async def completion_stream_generator() -> AsyncGenerator[str, None]:
# First chunk with role
for i in range(request.n):
choice_data = ChatCompletionResponseStreamChoice(
index=i,
delta=DeltaMessage(role="assistant"),
finish_reason=None,
)
chunk = ChatCompletionStreamResponse(object="chat.completion.chunk",
id=request_id,
choices=[choice_data],
model=model_Type)
data = chunk.json(exclude_unset=True, ensure_ascii=False)
yield f"data: {data}\n\n"
previous_texts = [""] * request.n
previous_num_tokens = [0] * request.n
async for res in result_generator:
res: RequestOutput
for output in res.outputs:
i = output.index
delta_text = output.text[len(previous_texts[i]):]
previous_texts[i] = output.text
previous_num_tokens[i] = len(output.token_ids)
response_json = create_stream_response_json(
index=i,
text=delta_text.encode("UTF-8").decode("UTF-8"),
)
yield f"data: {response_json}\n\n"
if output.finish_reason is not None:
response_json = create_stream_response_json(
index=i,
text="",
finish_reason=output.finish_reason,
)
yield f"data: {response_json}\n\n"
yield "data: [DONE]\n\n"
if request.stream or request.use_stream_chat:
return StreamingResponse(completion_stream_generator(),
media_type="text/event-stream")
final_res: RequestOutput = None
    choices = []  # new code
async for res in result_generator:
if await raw_request.is_disconnected():
# Abort the request if the client disconnects.
await engine.abort(request_id)
return create_error_response(HTTPStatus.BAD_REQUEST,
"Client disconnected")
        # new code
# for output in res.outputs:
# choice_data = ChatCompletionResponseChoice(
# index=output.index,
# message=ChatMessage(role="assistant", content=output.text),
# finish_reason=output.finish_reason,
# )
# choices.append(choice_data)
final_res = res
assert final_res is not None
choices = []
for output in final_res.outputs:
choice_data = ChatCompletionResponseChoice(
index=output.index,
message=ChatMessage(role="assistant", content=output.text),
finish_reason=output.finish_reason,
)
choices.append(choice_data)
    choice = choices[0]
    contents = [choice.message.content for choice in choices]
    content_list = [content_chuli(content) for content in contents]
    content2 = None
    # NOTE: this early return makes the retry logic below unreachable.
    return {"response": choices}
    return_content = []
    for content in content_list:
        count = 0  # newly added code 1
        first_tokens = " ".join(manual_tokenize(content)[:20])
        # ("抱歉" or "sorry") only ever tested "抱歉"; check both substrings explicitly.
        if ("抱歉" in first_tokens or "sorry" in first_tokens) or (
                len(manual_tokenize(content)) <= len(manual_tokenize(prompt)) * 0.005):
            count += 1  # newly added code 2
try:
sampling_params = SamplingParams(
n=1,
presence_penalty=request.presence_penalty,
frequency_penalty=request.frequency_penalty,
temperature=1,
top_p=request.top_p,
stop=request.stop,
stop_token_ids=request.stop_token_ids,
max_tokens=request.max_new_tokens,
best_of=request.best_of,
top_k=50,
ignore_eos=request.ignore_eos,
use_beam_search=request.use_beam_search,
skip_special_tokens=request.skip_special_tokens,
)
except ValueError as e:
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
pattern = r'(<\|im_start\|>system \n).*?(<\|im_end\|>)'
prompt = re.sub(pattern, r'\1You are a helpful assistant.\2', prompt)
result_generator = engine.generate(prompt, sampling_params, request_id,
token_ids)
final_res: RequestOutput = None
async for res in result_generator:
if await raw_request.is_disconnected():
# Abort the request if the client disconnects.
await engine.abort(request_id)
return create_error_response(HTTPStatus.BAD_REQUEST,
"Client disconnected")
final_res = res
assert final_res is not None
choices = []
for output in final_res.outputs:
choice_data = ChatCompletionResponseChoice(
index=output.index,
message=ChatMessage(role="assistant", content=output.text),
finish_reason=output.finish_reason,
)
choices.append(choice_data)
choice = choices[0]
            content2 = content_chuli(choice.message.content)
            # If the second call's result is not better than the first (shorter here), keep the first.
            if content2 and len(content2) >= len(content):
                content = content2
            # otherwise `content` keeps the result of the first pass
if request.history != "":
History = request.history
else:
History = []
print(f"content: {content}")
logger.info(f"content: {content}")
return_content.append({"response": content, "history": History, "count": count, "length": len(content)})
return return_content
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="vLLM OpenAI-Compatible RESTful API server.")
parser.add_argument("--host", type=str, default=None, help="host name")
parser.add_argument("--port", type=int, default=8098, help="port number")
parser.add_argument("--allow-credentials",
action="store_true",
help="allow credentials")
parser.add_argument("--allowed-origins",
type=json.loads,
default=["*"],
help="allowed origins")
parser.add_argument("--allowed-methods",
type=json.loads,
default=["*"],
help="allowed methods")
parser.add_argument("--allowed-headers",
type=json.loads,
default=["*"],
help="allowed headers")
parser.add_argument("--served-model-name",
type=str,
default=None,
help="The model name used in the API. If not "
"specified, the model name will be the same as "
"the huggingface name.")
parser.add_argument("--model_type",
type=str,
default=None,
help="The model name in huggingface.")
parser.add_argument("--tensor_parallel_size", type=int, default=1, help="number of gpus to use")
parser.add_argument("--gpu_memory_utilization", type=float, default=0.90)
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
logger.info(f"args: {args}")
global MODEL_TYPE
global MODEL_PATH
MODEL_TYPE = args.model_type
MODEL_PATH = args.model
if args.served_model_name is not None:
served_model = args.served_model_name
else:
served_model = args.model
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(engine_args)
engine_model_config = asyncio.run(engine.get_model_config())
max_model_len = engine_model_config.max_model_len
    # A separate tokenizer to map token IDs to strings.
tokenizer = get_tokenizer(engine_args.tokenizer,
tokenizer_mode=engine_args.tokenizer_mode,
trust_remote_code=engine_args.trust_remote_code)
"""uvicorn日志打印中加上当前时间"""
def converter(*args):
cn_tz = timezone(timedelta(hours=8), name='Asia/Shanghai')
dt = datetime.now(cn_tz)
return dt.timetuple() # 返回一个tuple
logging.Formatter.converter = converter
log_config = uvicorn.config.LOGGING_CONFIG
log_config["formatters"]["access"]["fmt"] = ' %(levelprefix)s %(asctime)s - %(client_addr)s - "%(request_line)s" - %(status_code)s '
log_config["formatters"]["default"]["fmt"] = " %(levelprefix)s %(asctime)s %(message)s "
uvicorn.run(app,
host=args.host,
port=args.port,
log_level="info",
timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
2 Answers
vulvrdjw1#
When tensor_parallel_size=2 is used, the output is:

wlp8pajw2#
I ran into this problem before on GCP with an older version of vLLM. It turned out that ray was failing to detect the GPUs on GCP; that bug was fixed a few weeks ago. Could you try the latest vLLM release? You are currently on vLLM 0.2.1, which is fairly old.
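If you are not sure which version is installed, a minimal sketch like this will confirm it before you upgrade with pip install -U vllm:

# Print the installed vLLM version; older releases may still have the ray GPU-detection issue mentioned above.
import vllm

print(vllm.__version__)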