Baichuan-7B: a ready-to-run script (cli_demo.py) with friendlier multi-GPU support; copy it over and try it if you need it

7fhtutme  posted 5 months ago  in Other
Follow (0) | Answers (8) | Views (92)
import os
import platform
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Features:
# 1. Automatically supports both CPU and GPU modes
# 2. When using a GPU, loads the model in half precision, halving VRAM usage
# 3. With multiple GPUs, automatically distributes the model across the cards
# 4. Chat context (multi-turn history) is not supported yet
# 5. Streaming ("typing") output is not supported yet (so very long answers appear to hang; adjust MAX_TOKENS as a temporary workaround)
# Author: lanny.yang.sh@gmail.com, hobbyist developer / Yang; feel free to email me with questions

def auto_configure_device_map(num_gpus: int):
    num_trans_layers = 32
    per_gpu_layers = num_trans_layers / num_gpus
    device_map = {'model.embed_tokens': 0,
                  'model.norm': num_gpus-1, 'lm_head': num_gpus-1}
    for i in range(num_trans_layers):
        device_map[f'model.layers.{i}'] = int(i//per_gpu_layers)
    return device_map

# MODEL_NAME = "../baichuan-7B-model"

MODEL_NAME = "baichuan-inc/baichuan-7B"
NUM_GPUS = torch.cuda.device_count() if torch.cuda.is_available() else 0
MAX_TOKENS = 512
device_map = auto_configure_device_map(NUM_GPUS) if NUM_GPUS > 0 else None
device = torch.device("cuda") if NUM_GPUS > 0 else torch.device("cpu")
device_dtype = torch.half if NUM_GPUS > 0 else torch.float
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True, device_map=device_map, torch_dtype=device_dtype)
model = model.eval()
os_name = platform.system()
clear_command = 'cls' if os_name == 'Windows' else 'clear'
hello_string = "欢迎使用 BaiChuan-7B 模型,输入内容即可进行对话,clear 清空对话历史,stop/exit/quit 终止程序"

def build_prompt(history):
    prompt = hello_string
    for query, response in history:
        prompt += f"\n\n用户: {query}"
        prompt += f"\n回复: {response}"
    return prompt

history = []
print(hello_string)
while True:
    query = input("\n用户: ")
    if query.strip() in ["stop", "stop()", "exit", "exit()", "quit", "quit()", "q", "q()"]:
        break
    if query.strip() in ["clear", "clear()", "cls", "cls()"]:
        history = []
        os.system(clear_command)
        print(hello_string)
        continue
    inputs = tokenizer(query, return_tensors='pt')
    inputs.input_ids = inputs.input_ids.to(device)
    inputs.attention_mask = inputs.attention_mask.to(device)
    pred = model.generate(inputs=inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=MAX_TOKENS, repetition_penalty=1.1)
    response = tokenizer.decode(pred.cpu().tolist()[0])
    response = response[len(query)+response.find(query):]
    if response[-4:] == "</s>": response = response[:-4]
    history += [(query, response)]
    print(f"\n回复: {response}")
    os.system(clear_command)
    print(build_prompt(history), flush=True)
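For reference, with NUM_GPUS = 2 the map built by auto_configure_device_map above works out to the layout below (this is just the function's output written out, to show how the 32 transformer layers get split):

# embeddings plus transformer layers 0-15 go to GPU 0;
# layers 16-31, the final norm and lm_head go to GPU 1
{'model.embed_tokens': 0,
 'model.layers.0': 0,  ..., 'model.layers.15': 0,
 'model.layers.16': 1, ..., 'model.layers.31': 1,
 'model.norm': 1,
 'lm_head': 1}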

nfzehxib1#

The GPU path uses half-precision floats: two 11GB 2080 Tis run it smoothly, and two 10GB cards also manage, but two 8GB cards are not enough. Host RAM, however, seems to spike to 40-something GB at the moment of loading (7G×4 + 7G×2); if you have plenty of VRAM but little system RAM, you can drop half mode.
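If the host-RAM spike at load time is the real constraint rather than VRAM, one thing worth trying is transformers' low_cpu_mem_usage=True flag, which loads the checkpoint shard by shard instead of materializing the whole model in system RAM first. A minimal sketch (not verified against Baichuan-7B specifically, and it assumes accelerate is installed):

import torch
from transformers import AutoModelForCausalLM

# Load weights incrementally to keep peak CPU memory low, while still
# placing the model on the GPUs in half precision as in the script above.
model = AutoModelForCausalLM.from_pretrained(
    "baichuan-inc/baichuan-7B",
    trust_remote_code=True,
    torch_dtype=torch.half,
    device_map="auto",        # or the device_map from auto_configure_device_map
    low_cpu_mem_usage=True,   # avoid building a full fp32 copy in host RAM
)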


z4iuyo4d2#

Error: File "/home/hope/work/baichuan-7B/try2.py", line 15, num_trans_layers = 32
IndentationError: expected an indented block after function definition on line 14.


fcipmucu3#

You need to re-indent it; GitHub issues are not great for pasting code. Fix the indentation and the error goes away.


cld4siwp5#

Cleaned up the formatting a bit:

import os
import platform
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Features:
# 1. Automatically supports both CPU and GPU modes
# 2. When using a GPU, loads the model in half precision, halving VRAM usage
# 3. With multiple GPUs, automatically distributes the model across the cards
# 4. Chat context (multi-turn history) is not supported yet
# 5. Streaming ("typing") output is not supported yet (so very long answers appear to hang; adjust MAX_TOKENS as a temporary workaround)
# Author: lanny.yang.sh@gmail.com, hobbyist developer / Yang; feel free to email me with questions

def auto_configure_device_map(num_gpus: int):
    num_trans_layers = 32
    per_gpu_layers = num_trans_layers / num_gpus
    device_map = {'model.embed_tokens': 0,
    'model.norm': num_gpus-1, 'lm_head': num_gpus-1}
    for i in range(num_trans_layers):
        device_map[f'model.layers.{i}'] = int(i//per_gpu_layers)
    return device_map

def build_prompt(history):
    prompt = hello_string
    for query, response in history:
        prompt += f"\n\n用户: {query}"
        prompt += f"\n回复: {response}"
    return prompt

#MODEL_NAME = "../baichuan-7B-model"

MODEL_NAME = "baichuan-inc/baichuan-7B"

NUM_GPUS = torch.cuda.device_count() if torch.cuda.is_available() else 0
MAX_TOKENS = 512
device_map = auto_configure_device_map(NUM_GPUS) if NUM_GPUS>0 else None
device = torch.device("cuda") if NUM_GPUS>0 else torch.device("cpu")
device_dtype = torch.half if NUM_GPUS>0 else torch.float

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True, device_map=device_map, torch_dtype=device_dtype)
model = model.eval()

os_name = platform.system()
clear_command = 'cls' if os_name == 'Windows' else 'clear'
hello_string = "欢迎使用 BaiChuan-7B 模型,输入内容即可进行对话,clear 清空对话历史,stop/exit/quit 终止程序"


history = []
print(hello_string)

while True:

    query = input("\n用户: ")

    if query.strip() in ["stop", "stop()", "exit", "exit()", "quit", "quit()", "q", "q()"]:
        break
    if query.strip() in ["clear", "clear()", "cls", "cls()"]:
        history = []
        os.system(clear_command)
        print(hello_string)
        continue

    inputs = tokenizer(query, return_tensors='pt')
    inputs.input_ids = inputs.input_ids.to(device)
    inputs.attention_mask = inputs.attention_mask.to(device)
    pred = model.generate(inputs=inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=MAX_TOKENS, repetition_penalty=1.1)
    response = tokenizer.decode(pred.cpu().tolist()[0])
    response = response[len(query)+response.find(query):]
    if response[-4:] == "</s>": response = response[:-4]

    history += [(query, response)]
    print(f"\n回复: {response}")

    os.system(clear_command)
    print(build_prompt(history), flush=True)
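On feature #5 (no streaming/"typing" output): if the installed transformers is 4.28 or newer, generate() accepts a streamer argument, so streaming could be wired into the loop roughly like this (a sketch only, not tested with Baichuan-7B):

from transformers import TextStreamer

# Prints tokens to stdout as they are generated instead of waiting for the
# whole answer; skip_prompt=True hides the echoed input prompt.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

pred = model.generate(inputs=inputs.input_ids,
                      attention_mask=inputs.attention_mask,
                      max_new_tokens=MAX_TOKENS,
                      repetition_penalty=1.1,
                      streamer=streamer)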

4urapxun6#

Thanks a lot, I'll test it shortly.


1qczuiv07#

The code above ran on an A10 on the first try. Roughly the commands I used:

conda create -n ai python=3.10
conda activate ai
git clone https://github.com/baichuan-inc/baichuan-7B.git
cd baichuan-7B/
pip install -r requirements.txt
pip install accelerate
python ./cli_demo.py

hjqgdpho8#

@lilongthinker @lanny2018
Why do I get the following error when using multiple GPUs?

It works fine on a single GPU.
The code is as follows:
