python ValueError: Invalid pattern: '**' can only be an entire path component

uplii1fm  asked on 2024-01-05  in Python

I'm trying to fine-tune an LLM.
My code so far:

    from datasets import load_dataset, DatasetDict, Dataset
    from transformers import (
        AutoTokenizer,
        AutoConfig,
        AutoModelForSequenceClassification,
        DataCollatorWithPadding,
        TrainingArguments,
        Trainer)
    from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
    import evaluate
    import torch
    import numpy as np

    # load dataset
    dataset = load_dataset('TokenBender/code_instructions_122k_alpaca_style')
    dataset

The error:

    ---------------------------------------------------------------------------
    ValueError Traceback (most recent call last)
    Cell In [12], line 2
    1 # load dataset
    ----> 2 dataset = load_dataset('TokenBender/code_instructions_122k_alpaca_style')
    3 dataset
    File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1664, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
    1661 ignore_verifications = ignore_verifications or save_infos
    1663 # Create a dataset builder
    -> 1664 builder_instance = load_dataset_builder(
    1665 path=path,
    1666 name=name,
    1667 data_dir=data_dir,
    1668 data_files=data_files,
    1669 cache_dir=cache_dir,
    1670 features=features,
    1671 download_config=download_config,
    1672 download_mode=download_mode,
    1673 revision=revision,
    1674 use_auth_token=use_auth_token,
    1675 **config_kwargs,
    1676 )
    1678 # Return iterable dataset in case of streaming
    1679 if streaming:
    File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1490, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
    1488 download_config = download_config.copy() if download_config else DownloadConfig()
    1489 download_config.use_auth_token = use_auth_token
    -> 1490 dataset_module = dataset_module_factory(
    1491 path,
    1492 revision=revision,
    1493 download_config=download_config,
    1494 download_mode=download_mode,
    1495 data_dir=data_dir,
    1496 data_files=data_files,
    1497 )
    1499 # Get dataset builder class from the processing script
    1500 builder_cls = import_main_class(dataset_module.module_path)
    File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1242, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
    1237 if isinstance(e1, FileNotFoundError):
    1238 raise FileNotFoundError(
    1239 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
    1240 f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
    1241 ) from None
    -> 1242 raise e1 from None
    1243 else:
    1244 raise FileNotFoundError(
    1245 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
    1246 )
    File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1223, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
    1215 return HubDatasetModuleFactoryWithScript(
    1216 path,
    1217 revision=revision,
    (...)
    1220 dynamic_modules_path=dynamic_modules_path,
    1221 ).get_module()
    1222 else:
    -> 1223 return HubDatasetModuleFactoryWithoutScript(
    1224 path,
    1225 revision=revision,
    1226 data_dir=data_dir,
    1227 data_files=data_files,
    1228 download_config=download_config,
    1229 download_mode=download_mode,
    1230 ).get_module()
    1231 except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
    1232 try:
    File /usr/local/lib/python3.9/dist-packages/datasets/load.py:846, in HubDatasetModuleFactoryWithoutScript.get_module(self)
    836 token = self.download_config.use_auth_token
    837 hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
    838 self.name,
    839 revision=self.revision,
    840 token=token,
    841 timeout=100.0,
    842 )
    843 patterns = (
    844 sanitize_patterns(self.data_files)
    845 if self.data_files is not None
    --> 846 else get_patterns_in_dataset_repository(hfh_dataset_info)
    847 )
    848 data_files = DataFilesDict.from_hf_repo(
    849 patterns,
    850 dataset_info=hfh_dataset_info,
    851 allowed_extensions=ALL_ALLOWED_EXTENSIONS,
    852 )
    853 infered_module_names = {
    854 key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
    855 for key, data_files_list in data_files.items()
    856 }
    File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:471, in get_patterns_in_dataset_repository(dataset_info)
    469 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info)
    470 try:
    --> 471 return _get_data_files_patterns(resolver)
    472 except FileNotFoundError:
    473 raise FileNotFoundError(
    474 f"The dataset repository at '{dataset_info.id}' doesn't contain any data file."
    475 ) from None
    File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:99, in _get_data_files_patterns(pattern_resolver)
    97 try:
    98 for pattern in patterns:
    ---> 99 data_files = pattern_resolver(pattern)
    100 if len(data_files) > 0:
    101 non_empty_splits.append(split)
    File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:303, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, allowed_extensions)
    301 data_files_ignore = FILES_TO_IGNORE
    302 fs = HfFileSystem(repo_info=dataset_info)
    --> 303 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
    304 matched_paths = [
    305 filepath
    306 for filepath in glob_iter
    307 if filepath.name not in data_files_ignore and not filepath.name.startswith(".")
    308 ]
    309 if allowed_extensions is not None:
    File /usr/local/lib/python3.9/dist-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
    602 depth = None
    604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
    --> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    607 pattern = re.compile(pattern)
    609 out = {
    610 p: info
    611 for p, info in sorted(allpaths.items())
    (...)
    618 )
    619 }
    File /usr/local/lib/python3.9/dist-packages/fsspec/utils.py:734, in glob_translate(pat)
    732 continue
    733 elif "**" in part:
    --> 734 raise ValueError(
    735 "Invalid pattern: '**' can only be an entire path component"
    736 )
    737 if part:
    738 results.extend(_translate(part, f"{not_sep}*", not_sep))
    ValueError: Invalid pattern: '**' can only be an entire path component


I tried searching online; the closest thing I found was this issue: https://github.com/coala/coala/issues/401
but I couldn't make sense of the solution there. Can anyone help me understand the error I'm running into and how to fix it? Thanks.
My library versions:

  • peft: '0.6.0'
  • torch: '2.1.2+cu121'
  • datasets: '2.1.0'
  • transformers: '4.21.3'

bvuwiixz1#

The error is most likely caused by an incompatibility between older datasets releases (roughly 2.1 through 2.14) and a newer fsspec version that is stricter about '**' in glob patterns. It has been fixed in the latest datasets release (2.15.0); see the discussion in the linked issue.
Update your installation with pip install -U datasets to get rid of the fsspec ValueError.
This also covers setups such as datasets 2.10.1 on Python 3.10, since the upgrade pulls in the patch that shipped with 2.15.0 and later.
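After upgrading, you can sanity-check the environment and retry the load. Here is a minimal sketch, assuming the upgrade has already been applied in the same environment as the notebook (the version printout and the expected 'train' split are illustrative assumptions, not taken from the original traceback):

    # Run after `pip install -U datasets` in the environment used by the notebook.
    import datasets
    import fsspec
    from datasets import load_dataset

    # Confirm the upgrade took effect: datasets should now report >= 2.15.0.
    print("datasets:", datasets.__version__)
    print("fsspec:", fsspec.__version__)

    # Retry the call that previously raised the glob ValueError.
    dataset = load_dataset('TokenBender/code_instructions_122k_alpaca_style')
    print(dataset)  # expected to print a DatasetDict, typically with a 'train' split

If upgrading datasets is not an option in your environment, downgrading fsspec to an older release that predates the stricter glob_translate check is a commonly suggested workaround, but moving to datasets >= 2.15.0 is the cleaner fix.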
