googleapis · gcf-merge-on-green · Jul 15, 2021 · Jun 8, 2021 · Jun 8, 2021 · Jun 15, 2021
diff --git a/google/cloud/aiplatform/datasets/time_series_dataset.py b/google/cloud/aiplatform/datasets/time_series_dataset.py
@@ -46,7 +46,7 @@ def create(
         encryption_spec_key_name: Optional[str] = None,
         sync: bool = True,
     ) -> "TimeSeriesDataset":
-        """Creates a new tabular dataset.
+        """Creates a new time series dataset.
 
         Args:
             display_name (str):

diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
@@ -2524,6 +2524,7 @@ def __init__(
         display_name: str,
         optimization_prediction_type: str,
         optimization_objective: Optional[str] = None,
+        column_specs: Optional[Dict[str, str]] = None,
         column_transformations: Optional[Union[Dict, List[Dict]]] = None,
         optimization_objective_recall_value: Optional[float] = None,
         optimization_objective_precision_value: Optional[float] = None,
@@ -2535,6 +2536,15 @@ def __init__(
     ):
         """Constructs a AutoML Tabular Training Job.
 
+        Example usage:
+
+        job = training_jobs.AutoMLTabularTrainingJob(
+            display_name="my_display_name",
+            optimization_prediction_type="classification",
+            optimization_objective="minimize-log-loss",
+            column_specs=my_column_specs,
+        )
+
         Args:
             display_name (str):
                 Required. The user-defined name of this TrainingPipeline.
@@ -2575,7 +2585,7 @@ def __init__(
                 "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
                 "minimize-mae" - Minimize mean-absolute error (MAE).
                 "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
-            column_transformations (Optional[Union[Dict, List[Dict]]]):
+            column_specs (Dict[str, str]):
                 Optional. Transformations to apply to the input columns (i.e. columns other
                 than the targetColumn). Each transformation may produce multiple
                 result values from the column's value, and all are used for training.
@@ -2584,6 +2594,17 @@ def __init__(
                 If an input column has no transformations on it, such a column is
                 ignored by the training, except for the targetColumn, which should have
                 no transformations defined on.
+                Only one of column_transformations or column_specs should be passed; if both are, column_transformations is used.
+            column_transformations (Union[Dict, List[Dict]]):
+                Optional. Transformations to apply to the input columns (i.e. columns other
+                than the targetColumn). Each transformation may produce multiple
+                result values from the column's value, and all are used for training.
+                When creating transformation for BigQuery Struct column, the column
+                should be flattened using "." as the delimiter.
+                If an input column has no transformations on it, such a column is
+                ignored by the training, except for the targetColumn, which should have
+                no transformations defined on.
+                Only one of column_transformations or column_specs should be passed; if both are, column_transformations is used.
             optimization_objective_recall_value (float):
                 Optional. Required when maximize-precision-at-recall optimizationObjective was
                 picked, represents the recall value at which the optimization is done.
@@ -2636,7 +2657,20 @@ def __init__(
             training_encryption_spec_key_name=training_encryption_spec_key_name,
             model_encryption_spec_key_name=model_encryption_spec_key_name,
         )
-        self._column_transformations = column_transformations
+        # user populated transformations
+        if column_transformations is not None and column_specs is not None:
+            _LOGGER.info(
+                "column_transformations and column_specs were both passed. column_transformations was used."
+            )
+        if column_transformations is not None:
+            self._column_specs = None
+            self._column_transformations = column_transformations
+        elif column_specs is not None:
+            self._column_specs = column_specs
+            self._column_transformations = None
+        else:
+            self._column_specs = None
+            self._column_transformations = None
         self._optimization_objective = optimization_objective
         self._optimization_prediction_type = optimization_prediction_type
         self._optimization_objective_recall_value = optimization_objective_recall_value
@@ -2854,11 +2888,18 @@ def _run(
 
         Returns:
             model: The trained Vertex AI Model resource or None if training did not
-                produce a Vertex AI Model.
+                produce a Vertex AI Model.
         """
 
         training_task_definition = schema.training_job.definition.automl_tabular
 
+        # convert column specs to column transformations
+        if self._column_specs is not None:
+            self._column_transformations = [
+                {item[1]: {"column_name": item[0]}}
+                for item in self._column_specs.items()
+            ]
+        # auto-populate transformations
         if self._column_transformations is None:
             _LOGGER.info(
                 "No column transformations provided, so now retrieving columns from dataset in order to set default column transformations."
@@ -2869,21 +2910,19 @@ def _run(
                 for column_name in dataset.column_names
                 if column_name != target_column
             ]
-            column_transformations = [
+            self._column_transformations = [
                 {"auto": {"column_name": column_name}} for column_name in column_names
             ]
 
             _LOGGER.info(
                 "The column transformation of type 'auto' was set for the following columns: %s."
                 % column_names
             )
-        else:
-            column_transformations = self._column_transformations
 
         training_task_inputs_dict = {
             # required inputs
             "targetColumn": target_column,
-            "transformations": column_transformations,
+            "transformations": self._column_transformations,
             "trainBudgetMilliNodeHours": budget_milli_node_hours,
             # optional inputs
             "weightColumnName": weight_column,
@@ -2934,6 +2973,50 @@ def _add_additional_experiments(self, additional_experiments: List[str]):
         """
         self._additional_experiments.extend(additional_experiments)
 
+    @staticmethod
+    def get_auto_column_specs(
+        dataset: datasets.TabularDataset, target_column: str,
+    ) -> Dict[str, str]:
+        """Returns a dict with all non-target columns as keys and 'auto' as values.
+
+        Example usage:
+
+        column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
+            dataset=my_dataset,
+            target_column="my_target_column",
+        )
+
+        Args:
+            dataset (datasets.TabularDataset):
+                Required. Intended dataset.
+            target_column (str):
+                Required. Intended target column.
+        Returns:
+            Dict[str, str]:
+                Column names as keys and 'auto' as values.
+
+        Raises:
+            RuntimeError: When no valid source is found.
+            ValueError: When target_column is not in dataset.
+        """
+        if target_column not in dataset.column_names:
+            raise ValueError("Target column not in dataset.")
+        column_names = [
+            column for column in dataset.column_names if column != target_column
+        ]
+        column_specs = {column: "auto" for column in column_names}
+        return column_specs
+
+    class column_data_types:
+        AUTO = "auto"
+        NUMERIC = "numeric"
+        CATEGORICAL = "categorical"
+        TIMESTAMP = "timestamp"
+        TEXT = "text"
+        REPEATED_NUMERIC = "repeated_numeric"
+        REPEATED_CATEGORICAL = "repeated_categorical"
+        REPEATED_TEXT = "repeated_text"
+
 
 class AutoMLForecastingTrainingJob(_TrainingJob):
     _supported_training_schemas = (schema.training_job.definition.automl_forecasting,)