[RELEASE] v0.2.1 (#475)

* [FIX] Documentation and docker workflow file (#449) * fixes to documentation and docker * fix to docker * Apply suggestions from code review * add change log for release (#450) * [FIX] release docs (#452) * Release 0.2 * Release 0.2.0 * fix docs new line * [FIX] ADD forecasting init design to pip data files (#459) * add forecasting_init.json to data files under setup * avoid undefined reference in scale_value * checks for time series dataset split (#464) * checks for time series dataset split * maint * Update autoPyTorch/datasets/time_series_dataset.py Co-authored-by: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Co-authored-by: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> * [FIX] Numerical stability scaling for timeseries forecasting tasks (#467) * resolve rebase conflict * add checks for scaling factors * flake8 fix * resolve conflict * [FIX] pipeline options in `fit_pipeline` (#466) * fix update of pipeline config options in fit pipeline * fix flake and test * suggestions from review * [FIX] results management and visualisation with missing test data (#465) * add flexibility to avoid checking for test scores * fix flake and test * fix bug in tests * suggestions from review * [ADD] Robustly refit models in final ensemble in parallel (#471) * add parallel model runner and update running traditional classifiers * update pipeline config to pipeline options * working refit function * fix mypy and flake * suggestions from review * fix mypy and flake * suggestions from review * finish documentation * fix tests * add test for parallel model runner * fix flake * fix tests * fix traditional prediction for refit * suggestions from review * add warning for failed processing of results * remove unnecessary change * update autopytorch version number * update autopytorch version number and the example file * [DOCS] Release notes v0.2.1 (#476) * Release 0.2.1 * add release docs * Update docs/releases.rst Co-authored-by: Difan Deng 
<33290713+dengdifan@users.noreply.github.com>
automl · Aug 23, 2022 · 56a2ac1 · 56a2ac1
1 parent c138dff
commit 56a2ac1
Show file tree

Hide file tree

Showing 37 changed files with 2,814 additions and 306 deletions.
diff --git a/autoPyTorch/__version__.py b/autoPyTorch/__version__.py
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.2"
+__version__ = "0.2.1"
diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py
@@ -58,6 +58,10 @@
                                          "forecasting tasks! Please run \n pip install autoPyTorch[forecasting] \n to "\
                                          "install the corresponding dependencies!"
 
+# This value is applied to ensure numerical stability when rescaling values as value / scale.
+# If the scale is smaller than this value, it is set to 1 so that the scaled value does not result in
+# overflow
+VERY_SMALL_VALUE = 1e-12
 
 # The constant values for time series forecasting comes from
 # https://github.com/rakshitha123/TSForecasting/blob/master/experiments/deep_learning_experiments.py
@@ -78,3 +82,6 @@
 
 # To avoid that we get a sequence that is too long to be fed to a network
 MAX_WINDOW_SIZE_BASE = 500
+
+# AutoPyTorch optionally allows network inference or metrics calculation for the following datasets
+OPTIONAL_INFERENCE_CHOICES = ('test',)
diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py
@@ -693,6 +693,17 @@ def __init__(self,
 
         self.splits = self.get_splits_from_resampling_strategy()  # type: ignore[assignment]
 
+        # Keep only the splits whose first element is non-empty (presumably the training indices — confirm).
+        valid_splits = [split for split in self.splits if len(split[0]) > 0]
+
+        if not valid_splits:
+            # Name the offending argument and show its value so the message is actionable.
+            raise ValueError(
+                f'The passed value for n_prediction_steps ({n_prediction_steps}) is unsuited for the current '
+                'dataset, please consider reducing n_prediction_steps')
+
+        self.splits = valid_splits
+
         # TODO doing experiments to give the most proper way of defining these two values
         if lagged_value is None:
             try:

diff --git a/autoPyTorch/ensemble/abstract_ensemble.py b/autoPyTorch/ensemble/abstract_ensemble.py
@@ -9,6 +9,9 @@
 class AbstractEnsemble(object):
     __metaclass__ = ABCMeta
 
+    def __init__(self):
+        self.identifiers_: List[Tuple[int, int, float]] = []  # starts empty; rewritten by update_identifiers()
+
     @abstractmethod
     def fit(
         self,
@@ -76,3 +79,12 @@ def get_validation_performance(self) -> float:
         Returns:
             Score
         """
+
+    def update_identifiers(
+        self,
+        replace_identifiers_mapping: Dict[Tuple[int, int, float], Tuple[int, int, float]]
+    ) -> None:
+        """Swap every stored identifier that is a key of the mapping for its mapped value; keep the rest."""
+        self.identifiers_ = [
+            replace_identifiers_mapping.get(current, current) for current in self.identifiers_
+        ]
diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
@@ -195,7 +195,8 @@ def get_additional_run_info(self) -> Dict[str, Any]:
                     Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs
         """
         return {'pipeline_configuration': self.configuration,
-                'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config()}
+                'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config(),
+                'configuration_origin': 'traditional'}
 
     def get_pipeline_representation(self) -> Dict[str, str]:
         return self.pipeline.get_pipeline_representation()
@@ -347,7 +348,7 @@ class AbstractEvaluator(object):
 
     An evaluator is an object that:
         + constructs a pipeline (i.e. a classification or regression estimator) for a given
-          pipeline_config and run settings (budget, seed)
+          pipeline_options and run settings (budget, seed)
         + Fits and trains this pipeline (TrainEvaluator) or tests a given
           configuration (TestEvaluator)
 
@@ -369,7 +370,7 @@ class AbstractEvaluator(object):
             The amount of epochs/time a configuration is allowed to run.
         budget_type  (str):
             The budget type. Currently, only epoch and time are allowed.
-        pipeline_config (Optional[Dict[str, Any]]):
+        pipeline_options (Optional[Dict[str, Any]]):
             Defines the content of the pipeline being evaluated. For example, it
             contains pipeline specific settings like logging name, or whether or not
             to use tensorboard.
@@ -430,7 +431,7 @@ def __init__(self, backend: Backend,
                  budget: float,
                  configuration: Union[int, str, Configuration],
                  budget_type: str = None,
-                 pipeline_config: Optional[Dict[str, Any]] = None,
+                 pipeline_options: Optional[Dict[str, Any]] = None,
                  seed: int = 1,
                  output_y_hat_optimization: bool = True,
                  num_run: Optional[int] = None,
@@ -523,10 +524,10 @@ def __init__(self, backend: Backend,
         self._init_params = init_params
 
         assert self.pipeline_class is not None, "Could not infer pipeline class"
-        pipeline_config = pipeline_config if pipeline_config is not None \
+        pipeline_options = pipeline_options if pipeline_options is not None \
             else self.pipeline_class.get_default_pipeline_options()
-        self.budget_type = pipeline_config['budget_type'] if budget_type is None else budget_type
-        self.budget = pipeline_config[self.budget_type] if budget == 0 else budget
+        self.budget_type = pipeline_options['budget_type'] if budget_type is None else budget_type
+        self.budget = pipeline_options[self.budget_type] if budget == 0 else budget
 
         self.num_run = 0 if num_run is None else num_run
 
@@ -539,7 +540,7 @@ def __init__(self, backend: Backend,
             port=logger_port,
         )
 
-        self._init_fit_dictionary(logger_port=logger_port, pipeline_config=pipeline_config, metrics_dict=metrics_dict)
+        self._init_fit_dictionary(logger_port=logger_port, pipeline_options=pipeline_options, metrics_dict=metrics_dict)
         self.Y_optimization: Optional[np.ndarray] = None
         self.Y_actual_train: Optional[np.ndarray] = None
         self.pipelines: Optional[List[BaseEstimator]] = None
@@ -597,7 +598,7 @@ def _init_datamanager_info(
     def _init_fit_dictionary(
         self,
         logger_port: int,
-        pipeline_config: Dict[str, Any],
+        pipeline_options: Dict[str, Any],
         metrics_dict: Optional[Dict[str, List[str]]] = None,
     ) -> None:
         """
@@ -608,7 +609,7 @@ def _init_fit_dictionary(
                 Logging is performed using a socket-server scheme to be robust against many
                 parallel entities that want to write to the same file. This integer states the
                 socket port for the communication channel.
-            pipeline_config (Dict[str, Any]):
+            pipeline_options (Dict[str, Any]):
                 Defines the content of the pipeline being evaluated. For example, it
                 contains pipeline specific settings like logging name, or whether or not
                 to use tensorboard.
@@ -634,7 +635,7 @@ def _init_fit_dictionary(
             'optimize_metric': self.metric.name
         })
 
-        self.fit_dictionary.update(pipeline_config)
+        self.fit_dictionary.update(pipeline_options)
         # If the budget is epochs, we want to limit that in the fit dictionary
         if self.budget_type == 'epochs':
             self.fit_dictionary['epochs'] = self.budget
@@ -805,6 +806,11 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float],
         if test_loss is not None:
             additional_run_info['test_loss'] = test_loss
 
+        # Add information to additional info that can be useful for other functionalities
+        additional_run_info['configuration'] = self.configuration \
+            if not isinstance(self.configuration, Configuration) else self.configuration.get_dictionary()
+        additional_run_info['budget'] = self.budget
+
         rval_dict = {'loss': cost,
                      'additional_run_info': additional_run_info,
                      'status': status}

diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py
@@ -123,7 +123,7 @@ def __init__(
         abort_on_first_run_crash: bool,
         pynisher_context: str,
         multi_objectives: List[str],
-        pipeline_config: Optional[Dict[str, Any]] = None,
+        pipeline_options: Optional[Dict[str, Any]] = None,
         initial_num_run: int = 1,
         stats: Optional[Stats] = None,
         run_obj: str = 'quality',
@@ -198,13 +198,13 @@ def __init__(
         self.disable_file_output = disable_file_output
         self.init_params = init_params
 
-        self.budget_type = pipeline_config['budget_type'] if pipeline_config is not None else budget_type
+        self.budget_type = pipeline_options['budget_type'] if pipeline_options is not None else budget_type
 
-        self.pipeline_config: Dict[str, Union[int, str, float]] = dict()
-        if pipeline_config is None:
-            pipeline_config = replace_string_bool_to_bool(json.load(open(
+        self.pipeline_options: Dict[str, Union[int, str, float]] = dict()
+        if pipeline_options is None:
+            pipeline_options = replace_string_bool_to_bool(json.load(open(
                 os.path.join(os.path.dirname(__file__), '../configs/default_pipeline_options.json'))))
-        self.pipeline_config.update(pipeline_config)
+        self.pipeline_options.update(pipeline_options)
 
         self.logger_port = logger_port
         if self.logger_port is None:
@@ -225,7 +225,7 @@ def __init__(
     def _check_and_get_default_budget(self) -> float:
         budget_type_choices_tabular = ('epochs', 'runtime')
         budget_choices = {
-            budget_type: float(self.pipeline_config.get(budget_type, np.inf))
+            budget_type: float(self.pipeline_options.get(budget_type, np.inf))
             for budget_type in budget_type_choices_tabular
         }
 
@@ -234,7 +234,7 @@ def _check_and_get_default_budget(self) -> float:
         budget_type_choices = budget_type_choices_tabular + FORECASTING_BUDGET_TYPE
 
         # budget is defined by epochs by default
-        budget_type = str(self.pipeline_config.get('budget_type', 'epochs'))
+        budget_type = str(self.pipeline_options.get('budget_type', 'epochs'))
         if self.budget_type is not None:
             budget_type = self.budget_type
 
@@ -361,7 +361,7 @@ def run(
             init_params=init_params,
             budget=budget,
             budget_type=self.budget_type,
-            pipeline_config=self.pipeline_config,
+            pipeline_options=self.pipeline_options,
             logger_port=self.logger_port,
             all_supported_metrics=self.all_supported_metrics,
             search_space_updates=self.search_space_updates

diff --git a/autoPyTorch/evaluation/test_evaluator.py b/autoPyTorch/evaluation/test_evaluator.py
@@ -51,7 +51,7 @@ class TestEvaluator(AbstractEvaluator):
             The amount of epochs/time a configuration is allowed to run.
         budget_type  (str):
             The budget type, which can be epochs or time
-        pipeline_config (Optional[Dict[str, Any]]):
+        pipeline_options (Optional[Dict[str, Any]]):
             Defines the content of the pipeline being evaluated. For example, it
             contains pipeline specific settings like logging name, or whether or not
             to use tensorboard.
@@ -113,7 +113,7 @@ def __init__(
         budget: float,
         configuration: Union[int, str, Configuration],
         budget_type: str = None,
-        pipeline_config: Optional[Dict[str, Any]] = None,
+        pipeline_options: Optional[Dict[str, Any]] = None,
         seed: int = 1,
         output_y_hat_optimization: bool = False,
         num_run: Optional[int] = None,
@@ -141,7 +141,7 @@ def __init__(
             budget_type=budget_type,
             logger_port=logger_port,
             all_supported_metrics=all_supported_metrics,
-            pipeline_config=pipeline_config,
+            pipeline_options=pipeline_options,
             search_space_updates=search_space_updates
         )
 
@@ -206,7 +206,7 @@ def eval_test_function(
     include: Optional[Dict[str, Any]],
     exclude: Optional[Dict[str, Any]],
     disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
-    pipeline_config: Optional[Dict[str, Any]] = None,
+    pipeline_options: Optional[Dict[str, Any]] = None,
     budget_type: str = None,
     init_params: Optional[Dict[str, Any]] = None,
     logger_port: Optional[int] = None,
@@ -230,7 +230,7 @@ def eval_test_function(
         budget_type=budget_type,
         logger_port=logger_port,
         all_supported_metrics=all_supported_metrics,
-        pipeline_config=pipeline_config,
+        pipeline_options=pipeline_options,
         search_space_updates=search_space_updates)
 
     evaluator.fit_predict_and_loss()
diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py
@@ -40,7 +40,7 @@ class TimeSeriesForecastingTrainEvaluator(TrainEvaluator):
             The amount of epochs/time a configuration is allowed to run.
         budget_type  (str):
             The budget type, which can be epochs or time
-        pipeline_config (Optional[Dict[str, Any]]):
+        pipeline_options (Optional[Dict[str, Any]]):
             Defines the content of the pipeline being evaluated. For example, it
             contains pipeline specific settings like logging name, or whether or not
             to use tensorboard.
@@ -106,7 +106,7 @@ def __init__(self, backend: Backend, queue: Queue,
                  metric: autoPyTorchMetric,
                  budget: float,
                  budget_type: str = None,
-                 pipeline_config: Optional[Dict[str, Any]] = None,
+                 pipeline_options: Optional[Dict[str, Any]] = None,
                  configuration: Optional[Configuration] = None,
                  seed: int = 1,
                  output_y_hat_optimization: bool = True,
@@ -138,7 +138,7 @@ def __init__(self, backend: Backend, queue: Queue,
             logger_port=logger_port,
             keep_models=keep_models,
             all_supported_metrics=all_supported_metrics,
-            pipeline_config=pipeline_config,
+            pipeline_options=pipeline_options,
             search_space_updates=search_space_updates
         )
         self.datamanager = backend.load_datamanager()
@@ -456,7 +456,7 @@ def forecasting_eval_train_function(
     include: Optional[Dict[str, Any]],
     exclude: Optional[Dict[str, Any]],
     disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
-    pipeline_config: Optional[Dict[str, Any]] = None,
+    pipeline_options: Optional[Dict[str, Any]] = None,
     budget_type: str = None,
     init_params: Optional[Dict[str, Any]] = None,
     logger_port: Optional[int] = None,
@@ -490,7 +490,7 @@ def forecasting_eval_train_function(
             The amount of epochs/time a configuration is allowed to run.
         budget_type  (str):
             The budget type, which can be epochs or time
-        pipeline_config (Optional[Dict[str, Any]]):
+        pipeline_options (Optional[Dict[str, Any]]):
             Defines the content of the pipeline being evaluated. For example, it
             contains pipeline specific settings like logging name, or whether or not
             to use tensorboard.
@@ -550,7 +550,7 @@ def forecasting_eval_train_function(
         budget_type=budget_type,
         logger_port=logger_port,
         all_supported_metrics=all_supported_metrics,
-        pipeline_config=pipeline_config,
+        pipeline_options=pipeline_options,
         search_space_updates=search_space_updates,
         max_budget=max_budget,
         min_num_test_instances=min_num_test_instances,

diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py
@@ -60,7 +60,7 @@ class TrainEvaluator(AbstractEvaluator):
             The amount of epochs/time a configuration is allowed to run.
         budget_type  (str):
             The budget type, which can be epochs or time
-        pipeline_config (Optional[Dict[str, Any]]):
+        pipeline_options (Optional[Dict[str, Any]]):
             Defines the content of the pipeline being evaluated. For example, it
             contains pipeline specific settings like logging name, or whether or not
             to use tensorboard.
@@ -121,7 +121,7 @@ def __init__(self, backend: Backend, queue: Queue,
                  budget: float,
                  configuration: Union[int, str, Configuration],
                  budget_type: str = None,
-                 pipeline_config: Optional[Dict[str, Any]] = None,
+                 pipeline_options: Optional[Dict[str, Any]] = None,
                  seed: int = 1,
                  output_y_hat_optimization: bool = True,
                  num_run: Optional[int] = None,
@@ -149,7 +149,7 @@ def __init__(self, backend: Backend, queue: Queue,
             budget_type=budget_type,
             logger_port=logger_port,
             all_supported_metrics=all_supported_metrics,
-            pipeline_config=pipeline_config,
+            pipeline_options=pipeline_options,
             search_space_updates=search_space_updates
         )
 
@@ -420,7 +420,7 @@ def eval_train_function(
     include: Optional[Dict[str, Any]],
     exclude: Optional[Dict[str, Any]],
     disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
-    pipeline_config: Optional[Dict[str, Any]] = None,
+    pipeline_options: Optional[Dict[str, Any]] = None,
     budget_type: str = None,
     init_params: Optional[Dict[str, Any]] = None,
     logger_port: Optional[int] = None,
@@ -452,7 +452,7 @@ def eval_train_function(
             The amount of epochs/time a configuration is allowed to run.
         budget_type  (str):
             The budget type, which can be epochs or time
-        pipeline_config (Optional[Dict[str, Any]]):
+        pipeline_options (Optional[Dict[str, Any]]):
             Defines the content of the pipeline being evaluated. For example, it
             contains pipeline specific settings like logging name, or whether or not
             to use tensorboard.
@@ -506,7 +506,7 @@ def eval_train_function(
         budget_type=budget_type,
         logger_port=logger_port,
         all_supported_metrics=all_supported_metrics,
-        pipeline_config=pipeline_config,
+        pipeline_options=pipeline_options,
         search_space_updates=search_space_updates,
     )
     evaluator.fit_predict_and_loss()