huggingface · lhoestq · Apr 23, 2024 · albertvillanova · Apr 23, 2024 · lhoestq
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -60,6 +60,7 @@
 import pyarrow as pa
 import pyarrow.compute as pc
 from fsspec.core import url_to_fs
+from fsspec.utils import stringify_path
 from huggingface_hub import (
     CommitInfo,
     CommitOperationAdd,
@@ -1449,7 +1450,7 @@ def save_to_disk(
         If you want to store paths or urls, please use the Value("string") type.
 
         Args:
-            dataset_path (`str`):
+            dataset_path (`PathLike`):
                 Path (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)
                 of the dataset directory where the dataset will be saved to.
             fs (`fsspec.spec.AbstractFileSystem`, *optional*):
@@ -1512,7 +1513,7 @@ def save_to_disk(
         num_shards = num_shards if num_shards is not None else num_proc
 
         fs: fsspec.AbstractFileSystem
-        fs, _ = url_to_fs(dataset_path, **(storage_options or {}))
+        fs, _ = url_to_fs(stringify_path(dataset_path), **(storage_options or {}))
 
         if not is_remote_filesystem(fs):
             parent_cache_files_paths = {
@@ -1649,7 +1650,7 @@ def _build_local_temp_path(uri_or_path: str) -> Path:
 
     @staticmethod
     def load_from_disk(
-        dataset_path: str,
+        dataset_path: PathLike,
         fs="deprecated",
         keep_in_memory: Optional[bool] = None,
         storage_options: Optional[dict] = None,
@@ -1659,7 +1660,7 @@ def load_from_disk(
         filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.
 
         Args:
-            dataset_path (`str`):
+            dataset_path (`PathLike`):
-                Path (e.g. `"dataset/train"`) or remote URI (e.g. `"s3//my-bucket/dataset/train"`)
+                Path (e.g. `"dataset/train"`) or remote URI (e.g. `"s3://my-bucket/dataset/train"`)
                 of the dataset directory where the dataset will be loaded from.
             fs (`fsspec.spec.AbstractFileSystem`, *optional*):
@@ -1702,7 +1703,7 @@ def load_from_disk(
             storage_options = fs.storage_options
 
         fs: fsspec.AbstractFileSystem
-        fs, dataset_path = url_to_fs(dataset_path, **(storage_options or {}))
+        fs, dataset_path = url_to_fs(stringify_path(dataset_path), **(storage_options or {}))
 
         dest_dataset_path = dataset_path
         dataset_dict_json_path = posixpath.join(dest_dataset_path, config.DATASETDICT_JSON_FILENAME)

diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
@@ -13,6 +13,7 @@
 import fsspec
 import numpy as np
 from fsspec.core import url_to_fs
+from fsspec.utils import stringify_path
 from huggingface_hub import (
     CommitInfo,
     CommitOperationAdd,
@@ -1231,7 +1232,7 @@ def save_to_disk(
         If you want to store paths or urls, please use the Value("string") type.
 
         Args:
-            dataset_dict_path (`str`):
+            dataset_dict_path (`PathLike`):
                 Path (e.g. `dataset/train`) or remote URI
                 (e.g. `s3://my-bucket/dataset/train`) of the dataset dict directory where the dataset dict will be
                 saved to.
@@ -1281,7 +1282,7 @@ def save_to_disk(
             storage_options = fs.storage_options
 
         fs: fsspec.AbstractFileSystem
-        fs, _ = url_to_fs(dataset_dict_path, **(storage_options or {}))
+        fs, _ = url_to_fs(stringify_path(dataset_dict_path), **(storage_options or {}))
 
         if num_shards is None:
             num_shards = {k: None for k in self}

diff --git a/src/datasets/load.py b/src/datasets/load.py
@@ -36,6 +36,7 @@
 import requests
 import yaml
 from fsspec.core import url_to_fs
+from fsspec.utils import stringify_path
 from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem
 
 from . import config
@@ -89,6 +90,7 @@
 from .utils.logging import get_logger
 from .utils.metadata import MetadataConfigs
 from .utils.py_utils import get_imports, lock_importable_file
+from .utils.typing import PathLike
 from .utils.version import Version
 
 
@@ -2632,14 +2634,17 @@ def load_dataset(
 
 
 def load_from_disk(
-    dataset_path: str, fs="deprecated", keep_in_memory: Optional[bool] = None, storage_options: Optional[dict] = None
+    dataset_path: PathLike,
+    fs="deprecated",
+    keep_in_memory: Optional[bool] = None,
+    storage_options: Optional[dict] = None,
 ) -> Union[Dataset, DatasetDict]:
     """
     Loads a dataset that was previously saved using [`~Dataset.save_to_disk`] from a dataset directory, or
     from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.
 
     Args:
-        dataset_path (`str`):
+        dataset_path (`PathLike`):
             Path (e.g. `"dataset/train"`) or remote URI (e.g.
             `"s3://my-bucket/dataset/train"`) of the [`Dataset`] or [`DatasetDict`] directory where the dataset will be
             loaded from.
@@ -2684,7 +2689,7 @@ def load_from_disk(
         storage_options = fs.storage_options
 
     fs: fsspec.AbstractFileSystem
-    fs, *_ = url_to_fs(dataset_path, **(storage_options or {}))
+    fs, *_ = url_to_fs(stringify_path(dataset_path), **(storage_options or {}))
     if not fs.exists(dataset_path):
         raise FileNotFoundError(f"Directory {dataset_path} not found")
     if fs.isfile(posixpath.join(dataset_path, config.DATASET_INFO_FILENAME)) and fs.isfile(