Support dataframes in DefaultExperiment (#160)
* Add support for dataframes in generic dataset

* Add class method to default experiment for easier init

* Workaround for DictConfig not accepting dataframes

* Change instantiation of class

* Complete docstring
sgpjesus committed Feb 1, 2024
1 parent d7929b9 commit cdff72e
Showing 4 changed files with 145 additions and 13 deletions.
4 changes: 2 additions & 2 deletions src/aequitas/flow/__init__.py
```diff
@@ -1,3 +1,3 @@
-from .experiment import Experiment
+from .experiment import DefaultExperiment, Experiment
 
-__all__ = ["Experiment"]
+__all__ = ["DefaultExperiment", "Experiment"]
```
15 changes: 15 additions & 0 deletions src/aequitas/flow/datasets/generic.py
```diff
@@ -21,6 +21,7 @@ def __init__(
         self,
         label_column: str,
         sensitive_column: str,
+        df: Optional[pd.DataFrame] = None,
         categorical_columns: list[str] = [],
         dataset_path: Optional[Union[str, Path]] = None,
         train_path: Optional[Union[str, Path]] = None,
@@ -37,6 +38,15 @@
         Parameters
         ----------
         label_column : str
+            The name of the label column in the dataset.
+        sensitive_column : str
+            The name of the sensitive column in the dataset.
+        df : pd.DataFrame, optional
+            The dataset to be used. If None, the dataset will be loaded from the
+            specified paths. Defaults to None.
+        dataset_path : Union[str, Path]
+            The path to the dataset. May be URL.
+        train_path : Union[str, Path]
             The path to the training data. May be URL.
         validation_path : Union[str, Path]
             The path to the validation data. May be URL.
@@ -72,6 +82,9 @@ def __init__(
         if url(dataset_path) or Path(dataset_path).exists():
             self.paths = [dataset_path]
             self.split_required = True
+        elif df is not None:
+            self.data = df
+            self.split_required = True
         else:
             # Validate if other paths exist
             if not (train_path and validation_path and test_path):
@@ -139,6 +152,8 @@ def _validate_splits(self) -> None:
     def load_data(self) -> None:
         """Load the dataset."""
         self.logger.info("Loading data.")
+        if hasattr(self, "data"):
+            return
         if self.extension == "parquet":
             read_method = pd.read_parquet
         elif self.extension == "csv":
```
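With the `df` argument wired through, a `GenericDataset` can be built from an in-memory frame instead of file paths. A minimal sketch, assuming a toy DataFrame; the column names and values are illustrative, not part of the library:

```python
import pandas as pd
from aequitas.flow.datasets import GenericDataset

# Toy data for illustration only.
df = pd.DataFrame({
    "feature": [0.1, 0.4, 0.35, 0.8],
    "group": ["a", "a", "b", "b"],
    "label": [0, 1, 0, 1],
})

dataset = GenericDataset(
    label_column="label",
    sensitive_column="group",
    categorical_columns=["group"],
    df=df,  # no dataset_path, train_path, etc. required
)
dataset.load_data()      # returns early, since self.data is already set
dataset.create_splits()  # split_required is True for DataFrame input
```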
109 changes: 103 additions & 6 deletions src/aequitas/flow/experiment/default.py
```diff
@@ -1,6 +1,7 @@
 from typing import Literal, Union
 
 from omegaconf import DictConfig
+import pandas as pd
 
 from . import _configs
 from .experiment import Experiment
@@ -9,7 +10,14 @@
 class DefaultExperiment(Experiment):
     def __init__(
         self,
-        dataset_config: Union[DictConfig, dict],
+        df: pd.DataFrame,
+        label_column: str,
+        sensitive_column: str,
+        categorical_columns: list[str] = [],
+        other_dataset_args: dict = None,
+        threshold_type: str = "fixed",
+        score_threshold: float = 0.5,
+        dataset_name: str = "Dataset",
         methods: Union[
             list[Literal["preprocessing", "inprocessing"]], Literal["all"]
         ] = "all",
@@ -24,19 +32,68 @@ def __init__(
         Parameters
         ----------
-        dataset_config : Union[DictConfig, dict]
-            Dataset configuration.
+        df : pd.DataFrame
+            Pandas DataFrame with the dataset to be used in the experiment.
+        label_column : str
+            Name of the column containing the label.
+        sensitive_column : str
+            Name of the column containing the sensitive attribute.
+        categorical_columns : list[str], optional
+            List of categorical columns. Defaults to [].
+        other_dataset_args : dict, optional
+            Other arguments to pass to the dataset. Defaults to None.
+        threshold_type : str, optional
+            Threshold type. Defaults to "fixed".
+        score_threshold : float, optional
+            Score threshold. Defaults to 0.5.
+        dataset_name : str, optional
+            Dataset name. Defaults to "Dataset".
         methods : Union[list[Literal["preprocessing", "inprocessing"]], Literal["all"]], optional
             Methods to include in the experiment. If "all", all methods will be included.
             Defaults to "all".
         experiment_size : Literal["test", "small", "medium", "large"], optional
             Experiment size. Defaults to "small".
         use_baseline : bool, optional
             Whether to include the baseline methods. Defaults to True.
 
         Raises
         ------
         ValueError
             If the methods or experiment size are not valid.
         """
+        dataset_config = {
+            dataset_name: {
+                "classpath": "aequitas.flow.datasets.GenericDataset",
+                "threshold": {
+                    "type": threshold_type,
+                    "value": score_threshold,
+                },
+                "args": {
+                    "df": df,
+                    "label_column": label_column,
+                    "sensitive_column": sensitive_column,
+                    "categorical_columns": categorical_columns,
+                    **(other_dataset_args or {}),
+                },
+            }
+        }
+
+        config = self._generate_config(
+            dataset_config=dataset_config,
+            methods=methods,
+            experiment_size=experiment_size,
+            use_baseline=use_baseline,
+        )
+
+        super().__init__(config=config)
+
+    @staticmethod
+    def _generate_config(
+        dataset_config: dict,
+        methods: Union[list[Literal["preprocessing", "inprocessing"]], Literal["all"]],
+        experiment_size: Literal["test", "small", "medium", "large"],
+        use_baseline: bool,
+    ):
         # Validate methods:
         if methods == "all":
             default_methods = [
@@ -73,10 +130,50 @@
                 "Invalid experiment_size value. Try one of "
                 f"{['test', 'small', 'medium', 'large']}."
             )
-        # Update experiment config:
-        config = {
+        # Generate experiment config:
+        return {
             "methods": method_configs,
             "datasets": [dataset_config],
             "optimization": experiment_config,
         }
-        super().__init__(config=DictConfig(config))
+
+    @classmethod
+    def from_config(
+        cls,
+        dataset_config: Union[DictConfig, dict],
+        methods: Union[
+            list[Literal["preprocessing", "inprocessing"]], Literal["all"]
+        ] = "all",
+        experiment_size: Literal["test", "small", "medium", "large"] = "small",
+        use_baseline: bool = True,
+    ):
+        """Create a DefaultExperiment from a dataset configuration.
+
+        Parameters
+        ----------
+        dataset_config : Union[DictConfig, dict]
+            Dataset configuration.
+        methods : Union[list[Literal["preprocessing", "inprocessing"]], Literal["all"]], optional
+            Methods to include in the experiment. If "all", all methods will be included.
+            Defaults to "all".
+        experiment_size : Literal["test", "small", "medium", "large"], optional
+            Experiment size. Defaults to "small".
+        use_baseline : bool, optional
+            Whether to include the baseline methods. Defaults to True.
+
+        Returns
+        -------
+        DefaultExperiment
+            Default experiment.
+
+        Raises
+        ------
+        ValueError
+            If the methods or experiment size are not valid.
+        """
+        config = cls._generate_config(
+            dataset_config=dataset_config,
+            methods=methods,
+            experiment_size=experiment_size,
+            use_baseline=use_baseline,
+        )
+
+        # Instantiate without DefaultExperiment.__init__, which expects a
+        # DataFrame; initialize through Experiment directly instead.
+        experiment = cls.__new__(cls)
+        Experiment.__init__(experiment, config=config)
+        return experiment
```
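Taken together, these changes reduce a quick fairness experiment to a single constructor call. A usage sketch with made-up data; the final `run()` call assumes the Experiment class's usual entry point, which is not part of this diff:

```python
import pandas as pd
from aequitas.flow import DefaultExperiment

# Illustrative data; swap in a real dataset.
df = pd.DataFrame({
    "income": [1200, 3400, 2300, 800],
    "group": ["a", "b", "a", "b"],
    "label": [0, 1, 1, 0],
})

experiment = DefaultExperiment(
    df=df,
    label_column="label",
    sensitive_column="group",
    categorical_columns=["group"],
    experiment_size="test",  # smallest preset, useful for a dry run
)
experiment.run()
```

Callers that already maintain a dataset configuration can keep the previous behavior through `DefaultExperiment.from_config(dataset_config)`.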
30 changes: 25 additions & 5 deletions src/aequitas/flow/experiment/experiment.py
```diff
@@ -2,6 +2,7 @@
 import hashlib
 import json
 import pickle
+import pandas as pd
 from pathlib import Path
 from typing import Iterable, Optional, Tuple, Union
 
@@ -54,7 +55,7 @@ class Experiment:
     def __init__(
         self,
         config_file: Optional[Path] = None,
-        config: Optional[DictConfig] = None,
+        config: Optional[Union[DictConfig, dict]] = None,
         default_fields: Iterable[str] = ("methods", "datasets"),
         save_artifacts: bool = True,
         save_folder: Optional[Path] = Path("artifacts"),
@@ -65,9 +66,21 @@
         self.logger = create_logger("Experiment")
         self.logger.info("Instantiating Experiment class.")
 
+        self.dfs = {}
         # Read config file
         if config is not None:
-            self.config = config
+            if isinstance(config, DictConfig):
+                self.config = config
+            else:
+                # check if we have pandas dataframes passed as arguments
+                datasets = config["datasets"]
+                for dataset in datasets:
+                    for name, conf in dataset.items():
+                        if "args" in conf and "df" in conf["args"]:
+                            self.dfs[name] = conf["args"]["df"]
+                            conf["args"]["df"] = None
+                self.config = DictConfig(config)
+
         elif config_file is not None:
             self.config_reader = ConfigReader(
                 config_file, default_fields=default_fields
@@ -104,12 +117,19 @@ def _instantiate_sampler(self) -> BaseSampler:
         return sampler(**self.config.optimization.sampler_args)  # type: ignore
 
     @staticmethod
-    def read_dataset(config: Union[dict, DictConfig]) -> Dataset:
+    def read_dataset(
+        config: Union[dict, DictConfig],
+        df: Optional[pd.DataFrame] = None,
+    ) -> Dataset:
         """Read a dataset from a configuration object."""
         if isinstance(config, dict):
             config = DictConfig(config)
         dataset_class = import_object(config.classpath)
-        dataset_object = dataset_class(**config.args)  # type: ignore
+        # Casting args to dict, to add df if necessary
+        args = dict(config.args)
+        if df is not None:
+            args["df"] = df
+        dataset_object = dataset_class(**args)  # type: ignore
         return dataset_object
 
     @staticmethod
@@ -128,7 +148,7 @@ def _read_datasets(self):
         for dataset in self.config.datasets:
             for name, configs in dataset.items():  # This iterates once.
                 self.logger.debug(f"Reading '{name}'. Configurations: {configs}.")
-                dataset_object = self.read_dataset(configs)
+                dataset_object = self.read_dataset(configs, self.dfs.get(name, None))
                 dataset_object.load_data()
                 dataset_object.create_splits()
                 self.logger.debug(f"Dataset {name} successfully read.")
```
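The `self.dfs` bookkeeping above is the "Workaround for DictConfig not accepting dataframes" from the commit message: OmegaConf configs only accept primitive and container values, so the frame is stashed on the experiment, replaced by `None` in the config, and re-injected when the dataset is read. A minimal sketch of the mechanism, assuming OmegaConf's usual value-type restrictions:

```python
import pandas as pd
from omegaconf import DictConfig

df = pd.DataFrame({"x": [1, 2]})
config = {"MyData": {"args": {"df": df, "label_column": "x"}}}

# DictConfig(config) would fail at this point, since a DataFrame is not a
# supported OmegaConf value type.
dfs = {}
for name, conf in config.items():
    if "df" in conf["args"]:
        dfs[name] = conf["args"]["df"]  # stash the frame out of band
        conf["args"]["df"] = None       # leave a serializable placeholder
cfg = DictConfig(config)                # now succeeds

# Later, read_dataset() re-injects the stashed frame into the dataset kwargs.
args = dict(cfg.MyData.args)
args["df"] = dfs["MyData"]
```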
