无法构建管道。在运行此代码后，下载/加载模型后代码会一直运行，无法停止：
pipe = mii.pipeline("facebook/opt-1.3b")
也尝试过：
facebook/opt-350m
facebook/opt-125m

PS: 我已经等待了至少3个小时，然后中断了执行。此外，我还没有尝试其他架构，因为我的本地GPU内存不足。

以下是我的系统配置供您参考：

通过WSL2.0在Windows 11 64位上运行Ubuntu
CUDA 11.5,6 GB NVIDIA GeForce RTX 3060笔记本电脑GPU
Intel Core i7-10870H(第10代),16GB RAM

Downloading (…)lve/main/config.json: 100%
653/653 [00:00<00:00, 36.8kB/s]
Downloading (…)neration_config.json: 100%
137/137 [00:00<00:00, 9.35kB/s]
Fetching 6 files: 100%
6/6 [15:42<00:00, 314.04s/it]
Downloading (…)okenizer_config.json: 100%
685/685 [00:00<00:00, 46.8kB/s]
Downloading (…)2dfb61d62/vocab.json: 100%
899k/899k [00:00<00:00, 1.08MB/s]
Downloading (…)cial_tokens_map.json: 100%
441/441 [00:00<00:00, 62.8kB/s]
Downloading pytorch_model.bin: 100%
2.63G/2.63G [15:40<00:00, 3.16MB/s]
[2023-11-14 19:25:02,408] [INFO] [engine_v2.py:64:__init__] Building model...
Using /home/<username>/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/<username>/.cache/torch_extensions/py310_cu121/inference_core_ops/build.ninja...
Building extension module inference_core_ops...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module inference_core_ops...
ninja: no work to do.
Time to load inference_core_ops op: 0.09155440330505371 seconds
Using /home/<username>/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...

在中断执行后，出现了以下错误。这是因为某个bug导致睡眠设置为无限大或某个巨大的问题吗？

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[2], line 2
      1 import mii
----> 2 pipe = mii.pipeline("facebook/opt-1.3b")

File ~/.local/lib/python3.10/site-packages/mii/api.py:159, in pipeline(model_name_or_path, model_config, **kwargs)
    155 if remaining_kwargs:
    156     raise UnknownArgument(
    157         f"Keyword argument(s) {remaining_kwargs.keys()} not recognized")
--> 159 inference_engine = load_model(model_config)
    160 tokenizer = load_tokenizer(model_config)
    161 inference_pipeline = MIIPipeline(
    162     inference_engine=inference_engine,
    163     tokenizer=tokenizer,
    164     model_config=model_config,
    165 )

File ~/.local/lib/python3.10/site-packages/mii/modeling/models.py:17, in load_model(model_config)
     15 provider = model_config.provider
     16 if provider == ModelProvider.HUGGING_FACE:
---> 17     inference_engine = build_hf_engine(
     18         path=model_config.model_name_or_path,
     19         engine_config=model_config.inference_engine_config)
     20 else:
     21     raise ValueError(f"Unknown model provider {provider}")

File ~/.local/lib/python3.10/site-packages/deepspeed/inference/v2/engine_factory.py:46, in build_hf_engine(path, engine_config, debug_level, random_weights_config, fill_random)
     43 else:
     44     raise ValueError(f"Unsupported model type {model_config.model_type}")
---> 46 return InferenceEngineV2(policy, engine_config)

File ~/.local/lib/python3.10/site-packages/deepspeed/inference/v2/engine_v2.py:65, in InferenceEngineV2.__init__(self, policy, engine_config)
     63 # Build model from policy
     64 inference_logger().info("Building model...")
---> 65 self._model = self._policy.build_model(self._config, self._base_mp_group)
     66 inference_logger().info("Model built.")
     68 # Create state manager

File ~/.local/lib/python3.10/site-packages/deepspeed/inference/v2/model_implementations/inference_policy_base.py:110, in InferenceV2Policy.build_model(self, engine_config, mp_group)
     94 def build_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> DSInferenceModelBase:
     95     """
     96     Completely instantiate the inference model. This will both create the ops needed to run the
     97     model, as well as load the model parameters via the checkpoint engine. For more context
   (...)
    108             run by the engine.
    109     """
--> 110     self.model = self.instantiate_model(engine_config, mp_group)
    111     self.populate_model_parameters()
    112     return self.model

File ~/.local/lib/python3.10/site-packages/deepspeed/inference/v2/model_implementations/opt/policy.py:23, in OPTPolicy.instantiate_model(self, engine_config, mp_group)
     22 def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> OPTInferenceModel:
---> 23     return OPTInferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group)

File ~/.local/lib/python3.10/site-packages/deepspeed/inference/v2/model_implementations/inference_transformer_base.py:229, in DSTransformerModelBase.__init__(self, config, engine_config, base_mp_group)
    227 self.make_norm_layer()
    228 self.make_qkv_layer()
--> 229 self.make_attn_layer()
    230 self.make_attn_out_layer()
    231 self.make_mlp_1_layer()

File ~/.local/lib/python3.10/site-packages/deepspeed/inference/v2/model_implementations/inference_transformer_base.py:346, in DSTransformerModelBase.make_attn_layer(self)
    334 softmax_scale = 1.0 / (self.head_size**0.5)
    336 attn_config = DSSelfAttentionConfig(max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
    337                                     n_heads_q=self.n_heads_q_local,
    338                                     n_heads_kv=self.n_heads_kv_local,
   (...)
    343                                     output_dtype=self.activation_dtype,
    344                                     positional_embedding_type=self.positional_embedding_type)
--> 346 self.attn = heuristics.instantiate_attention(attn_config, self._engine_config)

File ~/.local/lib/python3.10/site-packages/deepspeed/inference/v2/modules/heuristics.py:53, in instantiate_attention(attention_config, engine_config)
     51 # Currently, we only have one implementation, so we just return it.
     52 config = ConfigBundle(name="dense_blocked_attention", config=attention_config)
---> 53 return DSSelfAttentionRegistry.instantiate_config(config)

File ~/.local/lib/python3.10/site-packages/deepspeed/inference/v2/modules/module_registry.py:39, in DSModuleRegistryBase.instantiate_config(cls, config_bundle)
     36 if not target_implementation.supports_config(config_bundle.config):
     37     raise ValueError(f"Config {config_bundle.config} is not supported by {target_implementation}")
---> 39 return cls.registry[config_bundle.name](config_bundle.config, config_bundle.implementation_config)

File ~/.local/lib/python3.10/site-packages/deepspeed/inference/v2/modules/implementations/attention/dense_blocked_attention.py:79, in DSDenseBlockedAttention.__init__(self, config, implementation_config)
     77 embed_type = PositionalEmbeddingType(config.positional_embedding_type)
     78 if embed_type == PositionalEmbeddingType.none:
---> 79     self._kv_copy = LinearBlockedKVCopy(self._config.head_size, self._config.n_heads_q,
     80                                         self._config.n_heads_kv, self._config.input_dtype)
     81 elif embed_type == PositionalEmbeddingType.rotate_half:
     82     use_trained_freqs = "trained_freqs" in self._config.positional_embedding_args and self._config.positional_embedding_args[
     83         "trained_freqs"]

File ~/.local/lib/python3.10/site-packages/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/linear_blocked_kv_copy.py:53, in LinearBlockedKVCopy.__init__(self, head_size, n_q_heads, n_kv_heads, dtype)
     49 if dtype not in LinearBlockedKVCopy.supported_dtypes:
     50     raise ValueError("Unsupported data type: {}, supported_dtypes are {}".format(
     51         dtype, LinearBlockedKVCopy.supported_dtypes))
---> 53 inf_module = RaggedOpsBuilder().load()
     54 self.kernel = inf_module.linear_kv_copy
     55 self.head_size = head_size

File ~/.local/lib/python3.10/site-packages/deepspeed/ops/op_builder/builder.py:448, in OpBuilder.load(self, verbose)
    446     return op_module
    447 else:
--> 448     return self.jit_load(verbose)

File ~/.local/lib/python3.10/site-packages/deepspeed/ops/op_builder/builder.py:492, in OpBuilder.jit_load(self, verbose)
    489 if self.is_rocm_pytorch():
    490     cxx_args.append("-D__HIP_PLATFORM_AMD__=1")
--> 492 op_module = load(name=self.name,
    493                  sources=self.strip_empty_entries(sources),
    494                  extra_include_paths=self.strip_empty_entries(extra_include_paths),
    495                  extra_cflags=cxx_args,
    496                  extra_cuda_cflags=nvcc_args,
    497                  extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
    498                  verbose=verbose)
    500 build_duration = time.time() - start_build
    501 if verbose:

File ~/.local/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1308, in load(name, sources, extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths, build_directory, verbose, with_cuda, is_python_module, is_standalone, keep_intermediates)
   1216 def load(name,
   1217          sources: Union[str, List[str]],
   1218          extra_cflags=None,
   (...)
   1226          is_standalone=False,
   1227          keep_intermediates=True):
   1228     r'''
   1229     Loads a PyTorch C++ extension just-in-time (JIT).
   1230 
   (...)
   1306         ...     verbose=True)
   1307     '''
-> 1308     return _jit_compile(
   1309         name,
   1310         [sources] if isinstance(sources, str) else sources,
   1311         extra_cflags,
   1312         extra_cuda_cflags,
   1313         extra_ldflags,
   1314         extra_include_paths,
   1315         build_directory or _get_build_directory(name, verbose),
   1316         verbose,
   1317         with_cuda,
   1318         is_python_module,
   1319         is_standalone,
   1320         keep_intermediates=keep_intermediates)

File ~/.local/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1724, in _jit_compile(name, sources, extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths, build_directory, verbose, with_cuda, is_python_module, is_standalone, keep_intermediates)
   1722             baton.release()
   1723     else:
-> 1724         baton.wait()
   1725 elif verbose:
   1726     print('No modifications detected for re-loaded extension '
   1727           f'module {name}, skipping build step...',
   1728           file=sys.stderr)

File ~/.local/lib/python3.10/site-packages/torch/utils/file_baton.py:42, in FileBaton.wait(self)
     35 '''
     36 Periodically sleeps for a certain amount until the baton is released.
     37 
     38 The amount of time slept depends on the ``wait_seconds`` parameter
     39 passed to the constructor.
     40 '''
     41 while os.path.exists(self.lock_file_path):
---> 42     time.sleep(self.wait_seconds)

KeyboardInterrupt: