这是我的环境版本：

torch:2.2.1
transformers: 4.39.0.dev0
vllm: custom compile at master@24aecf421a4ad5989697010963074904fead9a1b

我使用SqueezeLLM量化训练了我的llama-7B模型，并希望使用vllm加载。以下是我的代码和回溯。

#git clone https://github.com/SqueezeAILab/SqueezeLLM.git
#git clone https://github.com/kssteven418/SqueezeLLM-gradients.git
#conda create -n sqllm-grad python=3.9 -y
#conda activate sqllm-grad
#cd SqueezeLLM-gradients
#pip install -e .
#pip install -r requirements.txt(mod torch>=2.2.1)
### Compute gradients
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=16 python run.py --output_dir [gradients_path] --model_name_or_path [model_path]

#cd SqueezeLLM/
#pip install -e .
#cd squeezellm
python setup_cuda.py install
#cd ../quantization
### Chunk model weights and gradients
python chunk_models.py --model [model_path] --output [model_chunk_path] --model_type llama

python chunk_models.py --model [gradients_path] --output [gradients_chunk_path] --model_type llama
### (Optional for D+S quantization) Outlier configuration generation
python generate_outlier_config.py --model [model_chunk_path] --range 1.8 --output [outlier_config]
### K-means clustering
python nuq.py --bit 4 --model_type llama --model [model_chunk_path] --gradient [gradient_chunk_path] --output [lut_path] --outlier_config [outlier_config]/outlier_config_o0.45.json --sensitivity 0.05
### Packing
python pack.py --model [model_path] --wbits 4 --folder [lut_path] --save [pack_path] --include_sparse --balance

AutoModelForCausalLM可以成功加载SqueezeLLM模型。

# load_quant from https://github.com/SqueezeAILab/SqueezeLLM/blob/main/llama.py#L136

from squeezellm.modelutils import *
from squeezellm.quant import *

def load_quant(model, checkpoint, wbits, include_sparse, topX):
    """
    topX is num_dense_channels.
    Number of dense channel used for hybrid kernel.
    """
    model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)

    model = model.eval()
    layers = find_layers(model)

    state_dict = torch.load(os.path.join(checkpoint, "pack_model.pt"))

    # load sparse thresholds from checkpoint
    if include_sparse:
        num_vals = {}
        for k, v in state_dict.items():
            if "sparse_threshold." in k:
                key = k.replace("sparse_threshold.", "")
                num_vals[key] = v
        for k, v in num_vals.items():
            del state_dict["sparse_threshold." + k]
    else:
        num_vals = None

    # replace layers
    for name in ["lm_head"]:
        if name in layers:
            del layers[name]
    make_quant_lut(
        model, layers, wbits, include_sparse=include_sparse, numvals=num_vals, topX=topX
    )
    del layers

    print("Loading model ...")
    state_dict = torch.load(os.path.join(checkpoint, "pack_model.pt"))
    model.load_state_dict(state_dict, strict=False)
    model.seqlen = 2048
    print("Done.")

    return model
model = load_quant("llama-2", adapter_path, 4, include_sparse=True, topX=10)
model = model.to(DEV)
model.eval()

但是vllm加载失败，出现错误。

from vllm import LLM, SamplingParams
import torch
model_path = '/root/ckpt161_quantization_w4_s0.45'

if __name__ == '__main__':
    llm = LLM(model=model_path, quantization="squeezellm", dtype=torch.float16)
    prompts = [
    "Hello, my name is"
        ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

堆栈跟踪：

Traceback (most recent call last):
  File "/root/python/dictionary/train/testbatchvllm.py", line 58, in <module>
    llm = LLM(model=model_path, quantization="squeezellm", dtype=torch.float16)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/python/github.com/vllm/vllm/entrypoints/llm.py", line 109, in __init__
    self.llm_engine = LLMEngine.from_engine_args(engine_args)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/python/github.com/vllm/vllm/engine/llm_engine.py", line 412, in from_engine_args
    engine = cls(*engine_configs,
             ^^^^^^^^^^^^^^^^^^^^
  File "/root/python/github.com/vllm/vllm/engine/llm_engine.py", line 142, in __init__
    self._init_workers()
  File "/root/python/github.com/vllm/vllm/engine/llm_engine.py", line 200, in _init_workers
    self._run_workers("load_model")
  File "/root/python/github.com/vllm/vllm/engine/llm_engine.py", line 1086, in _run_workers
    driver_worker_output = getattr(self.driver_worker,
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/python/github.com/vllm/vllm/worker/worker.py", line 99, in load_model
    self.model_runner.load_model()
  File "/root/python/github.com/vllm/vllm/worker/model_runner.py", line 88, in load_model
    self.model = get_model(self.model_config,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/python/github.com/vllm/vllm/model_executor/utils.py", line 52, in get_model
    return get_model_fn(model_config, device_config, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/python/github.com/vllm/vllm/model_executor/model_loader.py", line 86, in get_model
    model.load_weights(model_config.model, model_config.download_dir,
  File "/root/python/github.com/vllm/vllm/model_executor/models/llama.py", line 388, in load_weights
    param = params_dict[name]
            ~~~~~~~~~~~^^^^^^
KeyError: 'model.layers.0.self_attn.qkv_proj.rows'