I uploaded a custom huggingface dataset and want to load it, but I met ValueError: Invalid pattern: '**' can only be an entire path component.
from datasets import load_dataset
print('sarted')
gsm8k = load_dataset("gsm8k","main") #this works fine
gsm8k = load_dataset("jijivski/mock_gsm8k")#,download_mode="force_redownload")
# I met 'ValueError: Invalid pattern: '**' can only be an entire path component' here
print('loaded')
gsm8k
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[7], line 5
2 get_ipython().system('pip install -U datasets')
3 from datasets import load_dataset
----> 5 dataset = load_dataset("jijivski/mock_gsm8k")
File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1664, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
1661 ignore_verifications = ignore_verifications or save_infos
1663 # Create a dataset builder
-> 1664 builder_instance = load_dataset_builder(
1665 path=path,
1666 name=name,
1667 data_dir=data_dir,
1668 data_files=data_files,
1669 cache_dir=cache_dir,
1670 features=features,
1671 download_config=download_config,
1672 download_mode=download_mode,
1673 revision=revision,
1674 use_auth_token=use_auth_token,
1675 **config_kwargs,
1676 )
1678 # Return iterable dataset in case of streaming
1679 if streaming:
File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1490, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
1488 download_config = download_config.copy() if download_config else DownloadConfig()
1489 download_config.use_auth_token = use_auth_token
-> 1490 dataset_module = dataset_module_factory(
1491 path,
1492 revision=revision,
1493 download_config=download_config,
1494 download_mode=download_mode,
1495 data_dir=data_dir,
1496 data_files=data_files,
1497 )
1499 # Get dataset builder class from the processing script
1500 builder_cls = import_main_class(dataset_module.module_path)
File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1242, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1237 if isinstance(e1, FileNotFoundError):
1238 raise FileNotFoundError(
1239 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
1240 f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
1241 ) from None
-> 1242 raise e1 from None
1243 else:
1244 raise FileNotFoundError(
1245 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
1246 )
File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1230, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1215 return HubDatasetModuleFactoryWithScript(
1216 path,
1217 revision=revision,
(...)
1220 dynamic_modules_path=dynamic_modules_path,
1221 ).get_module()
1222 else:
1223 return HubDatasetModuleFactoryWithoutScript(
1224 path,
1225 revision=revision,
1226 data_dir=data_dir,
1227 data_files=data_files,
1228 download_config=download_config,
1229 download_mode=download_mode,
-> 1230 ).get_module()
1231 except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
1232 try:
File /opt/conda/lib/python3.10/site-packages/datasets/load.py:846, in HubDatasetModuleFactoryWithoutScript.get_module(self)
836 token = self.download_config.use_auth_token
837 hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
838 self.name,
839 revision=self.revision,
840 token=token,
841 timeout=100.0,
842 )
843 patterns = (
844 sanitize_patterns(self.data_files)
845 if self.data_files is not None
--> 846 else get_patterns_in_dataset_repository(hfh_dataset_info)
847 )
848 data_files = DataFilesDict.from_hf_repo(
849 patterns,
850 dataset_info=hfh_dataset_info,
851 allowed_extensions=ALL_ALLOWED_EXTENSIONS,
852 )
853 infered_module_names = {
854 key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
855 for key, data_files_list in data_files.items()
856 }
File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:471, in get_patterns_in_dataset_repository(dataset_info)
469 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info)
470 try:
--> 471 return _get_data_files_patterns(resolver)
472 except FileNotFoundError:
473 raise FileNotFoundError(
474 f"The dataset repository at '{dataset_info.id}' doesn't contain any data file."
475 ) from None
File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:99, in _get_data_files_patterns(pattern_resolver)
97 try:
98 for pattern in patterns:
---> 99 data_files = pattern_resolver(pattern)
100 if len(data_files) > 0:
101 non_empty_splits.append(split)
File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:303, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, allowed_extensions)
301 data_files_ignore = FILES_TO_IGNORE
302 fs = HfFileSystem(repo_info=dataset_info)
--> 303 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
304 matched_paths = [
305 filepath
306 for filepath in glob_iter
307 if filepath.name not in data_files_ignore and not filepath.name.startswith(".")
308 ]
309 if allowed_extensions is not None:
File /opt/conda/lib/python3.10/site-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
602 depth = None
604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
607 pattern = re.compile(pattern)
609 out = {
610 p: info
611 for p, info in sorted(allpaths.items())
(...)
618 )
619 }
File /opt/conda/lib/python3.10/site-packages/fsspec/utils.py:734, in glob_translate(pat)
732 continue
733 elif "**" in part:
--> 734 raise ValueError(
735 "Invalid pattern: '**' can only be an entire path component"
736 )
737 if part:
738 results.extend(_translate(part, f"{not_sep}*", not_sep))
ValueError: Invalid pattern: '**' can only be an entire path component
I have checked this similar question here: ValueError: Invalid pattern: '**' can only be an entire path component - Stack Overflow
but failed to solve my problem.
Update solution does not work for me, wondering if this problem is related to my dataset or the code to load it (I can load some other hunggingface datasets)
I Successfully installed datasets-2.15.0 fsspec-2023.10.0 pyarrow-hotfix-0.6 and the latest datasets 2.16.1, I met the same problem on online environment like kaggle.
Wondering if there is a way to figure out if my dataset have a problem...