From f935b764a8ce123fa60a1793080f6594ee952427 Mon Sep 17 00:00:00 2001
From: sirtorry <torryyang@google.com>
Date: Tue, 8 Jun 2021 01:50:49 -0400
Subject: [PATCH 01/22] add transformation_specs

---
 google/cloud/aiplatform/column.py             | 28 ++++++++++++
 .../aiplatform/datasets/tabular_dataset.py    | 22 +++++++++-
 .../datasets/time_series_dataset.py           |  3 +-
 google/cloud/aiplatform/training_jobs.py      | 43 +++++++++++++++++--
 4 files changed, 91 insertions(+), 5 deletions(-)
 create mode 100644 google/cloud/aiplatform/column.py

diff --git a/google/cloud/aiplatform/column.py b/google/cloud/aiplatform/column.py
new file mode 100644
index 0000000000..aa13e4d609
--- /dev/null
+++ b/google/cloud/aiplatform/column.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Vertex AI tabular data types"""
+
+class data_types:
+    AUTO = "auto"
+    NUMERIC = "numeric"
+    CATEGORICAL = "categorical"
+    TIMESTAMP = "timestamp"
+    TEXT = "text"
+    REPEATED_NUMERIC = "repeated_numeric"
+    REPEATED_CATEGORICAL = "repeated_categorical"
+    REPEATED_TEXT = "repeated_text"
diff --git a/google/cloud/aiplatform/datasets/tabular_dataset.py b/google/cloud/aiplatform/datasets/tabular_dataset.py
index 95f1b16f98..240a3bcb23 100644
--- a/google/cloud/aiplatform/datasets/tabular_dataset.py
+++ b/google/cloud/aiplatform/datasets/tabular_dataset.py
@@ -18,7 +18,7 @@
 import csv
 import logging
 
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Union
 
 from google.auth import credentials as auth_credentials
 
@@ -39,6 +39,26 @@ class TabularDataset(datasets._Dataset):
         schema.dataset.metadata.tabular,
     )
 
+    def auto_column_specs(self, target_column: str) -> Dict[str, str]:
+        """Returns a dict with all non-target columns as keys and 'auto' as values.
+        Args:
+            target_column(str):
+                Required. Intended target column.
+        Returns:
+            Dict[str, str]
+                Column names as keys and 'auto' as values
+
+        Raises:
+            RuntimeError: When no valid source is found.
+        """
+        column_names = [
+            column
+            for column in self.column_names
+            if column != target_column
+        ]
+        column_specs = {column: 'auto' for column in column_names}
+        return column_specs
+
     @property
     def column_names(self) -> List[str]:
         """Retrieve the columns for the dataset by extracting it from the Google Cloud Storage or
diff --git a/google/cloud/aiplatform/datasets/time_series_dataset.py b/google/cloud/aiplatform/datasets/time_series_dataset.py
index d5aa3dcbf2..dce0f5ec91 100644
--- a/google/cloud/aiplatform/datasets/time_series_dataset.py
+++ b/google/cloud/aiplatform/datasets/time_series_dataset.py
@@ -26,6 +26,7 @@
 from google.cloud.aiplatform import utils
 
 
+#TODO: extend tabular dataset
 class TimeSeriesDataset(datasets._Dataset):
     """Managed time series dataset resource for Vertex AI"""
 
@@ -46,7 +47,7 @@ def create(
         encryption_spec_key_name: Optional[str] = None,
         sync: bool = True,
     ) -> "TimeSeriesDataset":
-        """Creates a new tabular dataset.
+        """Creates a new d dataset.
 
         Args:
             display_name (str):
diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 51fdb55d13..87bfc98cd6 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -2301,6 +2301,7 @@ def __init__(
         optimization_prediction_type: str,
         optimization_objective: Optional[str] = None,
         column_transformations: Optional[Union[Dict, List[Dict]]] = None,
+        column_specs: Optional[Dict[str, str]] = None,
         optimization_objective_recall_value: Optional[float] = None,
         optimization_objective_precision_value: Optional[float] = None,
         project: Optional[str] = None,
@@ -2360,6 +2361,17 @@ def __init__(
                 If an input column has no transformations on it, such a column is
                 ignored by the training, except for the targetColumn, which should have
                 no transformations defined on.
+                Only one of column_transformations or column_specs should be passed.
+            column_specs (Optional[Dict[str, str]]):
+                Optional. Transformations to apply to the input columns (i.e. columns other
+                than the targetColumn). Each transformation may produce multiple
+                result values from the column's value, and all are used for training.
+                When creating transformation for BigQuery Struct column, the column
+                should be flattened using "." as the delimiter.
+                If an input column has no transformations on it, such a column is
+                ignored by the training, except for the targetColumn, which should have
+                no transformations defined on.
+                Only one of column_transformations or column_specs should be passed.
             optimization_objective_recall_value (float):
                 Optional. Required when maximize-precision-at-recall optimizationObjective was
                 picked, represents the recall value at which the optimization is done.
@@ -2413,6 +2425,7 @@ def __init__(
             model_encryption_spec_key_name=model_encryption_spec_key_name,
         )
         self._column_transformations = column_transformations
+        self._column_specs = column_specs
         self._optimization_objective = optimization_objective
         self._optimization_prediction_type = optimization_prediction_type
         self._optimization_objective_recall_value = optimization_objective_recall_value
@@ -2627,11 +2640,36 @@ def _run(
         Returns:
             model: The trained Vertex AI Model resource or None if training did not
                 produce an Vertex AI Model.
+        Raises:
+            ValueError: When column doesn't exist in dataset.
+            ValueError: When target column is in transformations.
         """
 
         training_task_definition = schema.training_job.definition.automl_tabular
+        column_transformations = None
 
-        if self._column_transformations is None:
+        # user populated transformations
+        if self._column_transformations is not None and self._column_specs is not None:
+            _LOGGER.info(
+                "column_transformations and column_specs were both passed. column_transformations was used."
+            )
+        if self._column_transformations is not None:
+            column_transformations = self._column_transformations
+        if self._column_specs is not None and column_transformations is None:
+            column_transformations = [
+                {self._column_specs[column]: {"column_name": column}} for column in self._column_specs
+            ]
+        if column_transformations is not None:
+            column_names = dataset.column_names
+            for transformation in column_transformations:
+                for data_type in transformation:
+                    column = transformation[data_type][column_name]
+                    if column not in column_names:
+                        raise ValueError(f"'{column}' is not in the dataset.")
+                    if column is target_column:
+                        raise ValueError("Target column is in transformations.")
+        # auto-populate transformations
+        if column_transformations is None:
             _LOGGER.info(
                 "No column transformations provided, so now retrieving columns from dataset in order to set default column transformations."
             )
@@ -2649,8 +2687,6 @@ def _run(
                 "The column transformation of type 'auto' was set for the following columns: %s."
                 % column_names
             )
-        else:
-            column_transformations = self._column_transformations
 
         training_task_inputs_dict = {
             # required inputs
@@ -2707,6 +2743,7 @@ def _add_additional_experiments(self, additional_experiments: List[str]):
         self._additional_experiments.extend(additional_experiments)
 
 
+#TODO: add tabular sugar to forecasting
 class AutoMLForecastingTrainingJob(_TrainingJob):
     _supported_training_schemas = (schema.training_job.definition.automl_forecasting,)
 

From f6edc10baf6ea7d293031663d444705e06facc4e Mon Sep 17 00:00:00 2001
From: sirtorry <torryyang@google.com>
Date: Tue, 8 Jun 2021 16:01:25 -0400
Subject: [PATCH 02/22] add tests

---
 .../aiplatform/datasets/tabular_dataset.py    |  3 ++
 .../test_automl_tabular_training_jobs.py      |  6 +++
 tests/unit/aiplatform/test_datasets.py        | 38 +++++++++++++++++++
 3 files changed, 47 insertions(+)

diff --git a/google/cloud/aiplatform/datasets/tabular_dataset.py b/google/cloud/aiplatform/datasets/tabular_dataset.py
index 240a3bcb23..d630073642 100644
--- a/google/cloud/aiplatform/datasets/tabular_dataset.py
+++ b/google/cloud/aiplatform/datasets/tabular_dataset.py
@@ -50,7 +50,10 @@ def auto_column_specs(self, target_column: str) -> Dict[str, str]:
 
         Raises:
             RuntimeError: When no valid source is found.
+            ValueError: When target_column is not in dataset
         """
+        if target_column is not in self.column_names:
+            raise ValueError("Target column not in dataset.")
         column_names = [
             column
             for column in self.column_names
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 761b03b5a0..74fdc94bdf 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -50,6 +50,12 @@
     {"auto": {"column_name": "petal_length"}},
     {"auto": {"column_name": "petal_width"}},
 ]
+__TEST_TRAINING_COLUMN_SPECS = {
+    "sepal_width": "auto",
+    "sepal_length": "auto",
+    "sepal_width": "auto",
+    "sepal_width": "auto",
+}
 _TEST_TRAINING_TARGET_COLUMN = "target"
 _TEST_TRAINING_BUDGET_MILLI_NODE_HOURS = 1000
 _TEST_TRAINING_WEIGHT_COLUMN = "weight"
diff --git a/tests/unit/aiplatform/test_datasets.py b/tests/unit/aiplatform/test_datasets.py
index 4c2a75c393..7bb2215491 100644
--- a/tests/unit/aiplatform/test_datasets.py
+++ b/tests/unit/aiplatform/test_datasets.py
@@ -1005,6 +1005,44 @@ def test_tabular_dataset_column_name_bigquery(self):
 
         assert my_dataset.column_names == ["column_1", "column_2"]
 
+    @pytest.mark.usefixtures(
+        "get_dataset_tabular_gcs_mock", "gcs_client_download_as_bytes_mock"
+    )
+    def test_tabular_dataset_column_name_gcs(self):
+        my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME)
+
+        with pytest.raises(ValueError):
+            my_dataset.auto_column_specs("column_3")
+
+    @pytest.mark.usefixtures(
+        "get_dataset_tabular_gcs_mock", "gcs_client_download_as_bytes_mock"
+    )
+    def test_tabular_dataset_column_name_gcs(self):
+        my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME)
+
+        assert my_dataset.auto_column_specs("column_2") == {"column_1": "auto"}
+
+    @pytest.mark.usefixtures(
+        "get_dataset_tabular_bq_mock",
+        "bigquery_client_mock",
+        "bigquery_table_schema_mock",
+    )
+    def test_tabular_dataset_column_name_bigquery(self):
+        my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME)
+
+        with pytest.raises(ValueError):
+            my_dataset.auto_column_specs("column_3")
+
+    @pytest.mark.usefixtures(
+        "get_dataset_tabular_bq_mock",
+        "bigquery_client_mock",
+        "bigquery_table_schema_mock",
+    )
+    def test_tabular_dataset_column_name_bigquery(self):
+        my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME)
+
+        assert my_dataset.auto_column_specs("column_2") == {"column_1": "auto"}
+
 
 class TestTextDataset:
     def setup_method(self):

From 619c04aee639858dd1586156a061748971de393d Mon Sep 17 00:00:00 2001
From: sirtorry <torryyang@google.com>
Date: Tue, 15 Jun 2021 01:31:25 -0400
Subject: [PATCH 03/22] address feedback

---
 google/cloud/aiplatform/column.py             |  1 +
 .../aiplatform/datasets/tabular_dataset.py    | 25 +-----------
 .../datasets/time_series_dataset.py           |  2 +-
 google/cloud/aiplatform/training_jobs.py      | 33 ++++++++++++++--
 .../test_automl_tabular_training_jobs.py      | 19 +++++++++-
 tests/unit/aiplatform/test_datasets.py        | 38 -------------------
 6 files changed, 50 insertions(+), 68 deletions(-)

diff --git a/google/cloud/aiplatform/column.py b/google/cloud/aiplatform/column.py
index aa13e4d609..159b3cf415 100644
--- a/google/cloud/aiplatform/column.py
+++ b/google/cloud/aiplatform/column.py
@@ -17,6 +17,7 @@
 
 """Vertex AI tabular data types"""
 
+
 class data_types:
     AUTO = "auto"
     NUMERIC = "numeric"
diff --git a/google/cloud/aiplatform/datasets/tabular_dataset.py b/google/cloud/aiplatform/datasets/tabular_dataset.py
index d630073642..95f1b16f98 100644
--- a/google/cloud/aiplatform/datasets/tabular_dataset.py
+++ b/google/cloud/aiplatform/datasets/tabular_dataset.py
@@ -18,7 +18,7 @@
 import csv
 import logging
 
-from typing import Dict, List, Optional, Sequence, Tuple, Union
+from typing import List, Optional, Sequence, Tuple, Union
 
 from google.auth import credentials as auth_credentials
 
@@ -39,29 +39,6 @@ class TabularDataset(datasets._Dataset):
         schema.dataset.metadata.tabular,
     )
 
-    def auto_column_specs(self, target_column: str) -> Dict[str, str]:
-        """Returns a dict with all non-target columns as keys and 'auto' as values.
-        Args:
-            target_column(str):
-                Required. Intended target column.
-        Returns:
-            Dict[str, str]
-                Column names as keys and 'auto' as values
-
-        Raises:
-            RuntimeError: When no valid source is found.
-            ValueError: When target_column is not in dataset
-        """
-        if target_column is not in self.column_names:
-            raise ValueError("Target column not in dataset.")
-        column_names = [
-            column
-            for column in self.column_names
-            if column != target_column
-        ]
-        column_specs = {column: 'auto' for column in column_names}
-        return column_specs
-
     @property
     def column_names(self) -> List[str]:
         """Retrieve the columns for the dataset by extracting it from the Google Cloud Storage or
diff --git a/google/cloud/aiplatform/datasets/time_series_dataset.py b/google/cloud/aiplatform/datasets/time_series_dataset.py
index dce0f5ec91..f9f9b865d8 100644
--- a/google/cloud/aiplatform/datasets/time_series_dataset.py
+++ b/google/cloud/aiplatform/datasets/time_series_dataset.py
@@ -26,7 +26,7 @@
 from google.cloud.aiplatform import utils
 
 
-#TODO: extend tabular dataset
+# TODO: extend tabular dataset
 class TimeSeriesDataset(datasets._Dataset):
     """Managed time series dataset resource for Vertex AI"""
 
diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 87bfc98cd6..3179686137 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -2657,13 +2657,14 @@ def _run(
             column_transformations = self._column_transformations
         if self._column_specs is not None and column_transformations is None:
             column_transformations = [
-                {self._column_specs[column]: {"column_name": column}} for column in self._column_specs
+                {self._column_specs[column]: {"column_name": column}}
+                for column in self._column_specs
             ]
         if column_transformations is not None:
             column_names = dataset.column_names
             for transformation in column_transformations:
                 for data_type in transformation:
-                    column = transformation[data_type][column_name]
+                    column = transformation[data_type]
                     if column not in column_names:
                         raise ValueError(f"'{column}' is not in the dataset.")
                     if column is target_column:
@@ -2742,8 +2743,34 @@ def _add_additional_experiments(self, additional_experiments: List[str]):
         """
         self._additional_experiments.extend(additional_experiments)
 
+    @classmethod
+    def get_auto_column_specs(
+        self, dataset: datasets.TabularDataset, target_column: str,
+    ) -> Dict[str, str]:
+        """Returns a dict with all non-target columns as keys and 'auto' as values.
+        Args:
+            dataset (datasets.TabularDataset):
+                Required. Intended dataset.
+            target_column(str):
+                Required. Intended target column.
+        Returns:
+            Dict[str, str]
+                Column names as keys and 'auto' as values
+
+        Raises:
+            RuntimeError: When no valid source is found.
+            ValueError: When target_column is not in dataset
+        """
+        if target_column not in dataset.column_names:
+            raise ValueError("Target column not in dataset.")
+        column_names = [
+            column for column in dataset.column_names if column != target_column
+        ]
+        column_specs = {column: "auto" for column in column_names}
+        return column_specs
+
 
-#TODO: add tabular sugar to forecasting
+# TODO: add tabular sugar to forecasting
 class AutoMLForecastingTrainingJob(_TrainingJob):
     _supported_training_schemas = (schema.training_job.definition.automl_forecasting,)
 
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 74fdc94bdf..1e6671c6e5 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -233,11 +233,20 @@ def test_run_call_pipeline_service_create(
             encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
         )
 
+        column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
+            dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN,
+        )
+
+        assert column_specs == __TEST_TRAINING_COLUMN_SPECS
+        column_specs[
+            _TEST_TRAINING_COLUMN_NAMES[0]
+        ] = aiplatform.column.data_types.NUMERIC
+
         job = training_jobs.AutoMLTabularTrainingJob(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
-            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+            column_specs=column_specs,
             optimization_objective_recall_value=None,
             optimization_objective_precision_value=None,
         )
@@ -315,11 +324,17 @@ def test_run_call_pipeline_if_no_model_display_name(
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
 
+        column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
+            dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN,
+        )
+
+        assert column_specs == __TEST_TRAINING_COLUMN_SPECS
+
         job = training_jobs.AutoMLTabularTrainingJob(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
-            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+            column_specs=column_specs,
             optimization_objective_recall_value=None,
             optimization_objective_precision_value=None,
             training_encryption_spec_key_name=_TEST_PIPELINE_ENCRYPTION_KEY_NAME,
diff --git a/tests/unit/aiplatform/test_datasets.py b/tests/unit/aiplatform/test_datasets.py
index 7bb2215491..4c2a75c393 100644
--- a/tests/unit/aiplatform/test_datasets.py
+++ b/tests/unit/aiplatform/test_datasets.py
@@ -1005,44 +1005,6 @@ def test_tabular_dataset_column_name_bigquery(self):
 
         assert my_dataset.column_names == ["column_1", "column_2"]
 
-    @pytest.mark.usefixtures(
-        "get_dataset_tabular_gcs_mock", "gcs_client_download_as_bytes_mock"
-    )
-    def test_tabular_dataset_column_name_gcs(self):
-        my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME)
-
-        with pytest.raises(ValueError):
-            my_dataset.auto_column_specs("column_3")
-
-    @pytest.mark.usefixtures(
-        "get_dataset_tabular_gcs_mock", "gcs_client_download_as_bytes_mock"
-    )
-    def test_tabular_dataset_column_name_gcs(self):
-        my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME)
-
-        assert my_dataset.auto_column_specs("column_2") == {"column_1": "auto"}
-
-    @pytest.mark.usefixtures(
-        "get_dataset_tabular_bq_mock",
-        "bigquery_client_mock",
-        "bigquery_table_schema_mock",
-    )
-    def test_tabular_dataset_column_name_bigquery(self):
-        my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME)
-
-        with pytest.raises(ValueError):
-            my_dataset.auto_column_specs("column_3")
-
-    @pytest.mark.usefixtures(
-        "get_dataset_tabular_bq_mock",
-        "bigquery_client_mock",
-        "bigquery_table_schema_mock",
-    )
-    def test_tabular_dataset_column_name_bigquery(self):
-        my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME)
-
-        assert my_dataset.auto_column_specs("column_2") == {"column_1": "auto"}
-
 
 class TestTextDataset:
     def setup_method(self):

From 2d52e0dd77845e660087eb72eaf397faa0e77e41 Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@gmail.com>
Date: Tue, 15 Jun 2021 08:32:52 -0400
Subject: [PATCH 04/22] fix test fails

---
 google/cloud/aiplatform/training_jobs.py                   | 4 ++--
 tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 3179686137..4344595db6 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -2665,9 +2665,9 @@ def _run(
             for transformation in column_transformations:
                 for data_type in transformation:
                     column = transformation[data_type]
-                    if column not in column_names:
+                    if column['column_name'] not in column_names:
                         raise ValueError(f"'{column}' is not in the dataset.")
-                    if column is target_column:
+                    if column['column_name'] is target_column:
                         raise ValueError("Target column is in transformations.")
         # auto-populate transformations
         if column_transformations is None:
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 1e6671c6e5..36f4749228 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -42,6 +42,7 @@
     "sepal_length",
     "petal_length",
     "petal_width",
+    "target",
 ]
 
 _TEST_TRAINING_COLUMN_TRANSFORMATIONS = [

From 42ae3e7f4563862f7bfdcf78f07ae440993ee855 Mon Sep 17 00:00:00 2001
From: Torry Yang <sirtorry@users.noreply.github.com>
Date: Tue, 15 Jun 2021 15:16:36 -0400
Subject: [PATCH 05/22] fix typo

---
 tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 36f4749228..438301bc90 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -51,7 +51,7 @@
     {"auto": {"column_name": "petal_length"}},
     {"auto": {"column_name": "petal_width"}},
 ]
-__TEST_TRAINING_COLUMN_SPECS = {
+_TEST_TRAINING_COLUMN_SPECS = {
     "sepal_width": "auto",
     "sepal_length": "auto",
     "sepal_width": "auto",
@@ -238,7 +238,7 @@ def test_run_call_pipeline_service_create(
             dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN,
         )
 
-        assert column_specs == __TEST_TRAINING_COLUMN_SPECS
+        assert column_specs == _TEST_TRAINING_COLUMN_SPECS
         column_specs[
             _TEST_TRAINING_COLUMN_NAMES[0]
         ] = aiplatform.column.data_types.NUMERIC
@@ -329,7 +329,7 @@ def test_run_call_pipeline_if_no_model_display_name(
             dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN,
         )
 
-        assert column_specs == __TEST_TRAINING_COLUMN_SPECS
+        assert column_specs == _TEST_TRAINING_COLUMN_SPECS
 
         job = training_jobs.AutoMLTabularTrainingJob(
             display_name=_TEST_DISPLAY_NAME,

From 7789d4e8cf98f6cd0a4b158df9b2ccf70c3aed8b Mon Sep 17 00:00:00 2001
From: sirtorry <torryyang@google.com>
Date: Tue, 15 Jun 2021 16:18:36 -0400
Subject: [PATCH 06/22] test bug

---
 tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 438301bc90..2cb137b8b0 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -54,8 +54,8 @@
 _TEST_TRAINING_COLUMN_SPECS = {
     "sepal_width": "auto",
     "sepal_length": "auto",
-    "sepal_width": "auto",
-    "sepal_width": "auto",
+    "petal_width": "auto",
+    "petal_length": "auto",
 }
 _TEST_TRAINING_TARGET_COLUMN = "target"
 _TEST_TRAINING_BUDGET_MILLI_NODE_HOURS = 1000

From 3e6c94063ff46430504e5ca1f48c0211960e695d Mon Sep 17 00:00:00 2001
From: sirtorry <torryyang@google.com>
Date: Tue, 15 Jun 2021 16:38:47 -0400
Subject: [PATCH 07/22] add column to init

---
 google/cloud/aiplatform/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py
index 6aa8f64161..39b88a08d9 100644
--- a/google/cloud/aiplatform/__init__.py
+++ b/google/cloud/aiplatform/__init__.py
@@ -45,6 +45,7 @@
     AutoMLTextTrainingJob,
     AutoMLVideoTrainingJob,
 )
+from google.cloud.aiplatform.column import data_types
 
 """
 Usage:

From ab911095662508a5593a9f912f3e25858015084f Mon Sep 17 00:00:00 2001
From: sirtorry <torryyang@google.com>
Date: Tue, 15 Jun 2021 17:11:19 -0400
Subject: [PATCH 08/22] modify test

---
 tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 2cb137b8b0..37c6e4ca0d 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -242,6 +242,10 @@ def test_run_call_pipeline_service_create(
         column_specs[
             _TEST_TRAINING_COLUMN_NAMES[0]
         ] = aiplatform.column.data_types.NUMERIC
+        column_specs[
+            _TEST_TRAINING_COLUMN_NAMES[0]
+        ] = aiplatform.column.data_types.AUTO
+
 
         job = training_jobs.AutoMLTabularTrainingJob(
             display_name=_TEST_DISPLAY_NAME,

From 9a09ae1b7c8e5e74c333e30b50a95f354857d91d Mon Sep 17 00:00:00 2001
From: sirtorry <torryyang@google.com>
Date: Tue, 15 Jun 2021 19:09:15 -0400
Subject: [PATCH 09/22] lint

---
 google/cloud/aiplatform/__init__.py                        | 1 -
 google/cloud/aiplatform/training_jobs.py                   | 4 ++--
 tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 5 +----
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py
index 39b88a08d9..6aa8f64161 100644
--- a/google/cloud/aiplatform/__init__.py
+++ b/google/cloud/aiplatform/__init__.py
@@ -45,7 +45,6 @@
     AutoMLTextTrainingJob,
     AutoMLVideoTrainingJob,
 )
-from google.cloud.aiplatform.column import data_types
 
 """
 Usage:
diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 4344595db6..3a9105aaa7 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -2665,9 +2665,9 @@ def _run(
             for transformation in column_transformations:
                 for data_type in transformation:
                     column = transformation[data_type]
-                    if column['column_name'] not in column_names:
+                    if column["column_name"] not in column_names:
                         raise ValueError(f"'{column}' is not in the dataset.")
-                    if column['column_name'] is target_column:
+                    if column["column_name"] is target_column:
                         raise ValueError("Target column is in transformations.")
         # auto-populate transformations
         if column_transformations is None:
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 37c6e4ca0d..d7016282c3 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -242,10 +242,7 @@ def test_run_call_pipeline_service_create(
         column_specs[
             _TEST_TRAINING_COLUMN_NAMES[0]
         ] = aiplatform.column.data_types.NUMERIC
-        column_specs[
-            _TEST_TRAINING_COLUMN_NAMES[0]
-        ] = aiplatform.column.data_types.AUTO
-
+        column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] = aiplatform.column.data_types.AUTO
 
         job = training_jobs.AutoMLTabularTrainingJob(
             display_name=_TEST_DISPLAY_NAME,

From 9a1ce72145a5a0eb4b1fc72b7f7d394254ae2b6d Mon Sep 17 00:00:00 2001
From: sirtorry <torryyang@google.com>
Date: Tue, 15 Jun 2021 19:52:10 -0400
Subject: [PATCH 10/22] undo column removal from init

---
 google/cloud/aiplatform/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py
index 6aa8f64161..fcad51b35b 100644
--- a/google/cloud/aiplatform/__init__.py
+++ b/google/cloud/aiplatform/__init__.py
@@ -45,6 +45,7 @@
     AutoMLTextTrainingJob,
     AutoMLVideoTrainingJob,
 )
+from google.cloud.aiplatform.column import data_types
 
 """
 Usage:
@@ -89,4 +90,5 @@
     "TextDataset",
     "TimeSeriesDataset",
     "VideoDataset",
+    "data_types",
 )

From d738b55e23c5935a221ad8784662b9186746308b Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Sat, 19 Jun 2021 18:24:38 -0400
Subject: [PATCH 11/22] address comments

---
 .../datasets/time_series_dataset.py           |  2 +-
 google/cloud/aiplatform/training_jobs.py      | 20 +++++++++----------
 .../test_automl_tabular_training_jobs.py      |  1 +
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/google/cloud/aiplatform/datasets/time_series_dataset.py b/google/cloud/aiplatform/datasets/time_series_dataset.py
index f9f9b865d8..8070daf10f 100644
--- a/google/cloud/aiplatform/datasets/time_series_dataset.py
+++ b/google/cloud/aiplatform/datasets/time_series_dataset.py
@@ -47,7 +47,7 @@ def create(
         encryption_spec_key_name: Optional[str] = None,
         sync: bool = True,
     ) -> "TimeSeriesDataset":
-        """Creates a new d dataset.
+        """Creates a new time series dataset.
 
         Args:
             display_name (str):
diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 3a9105aaa7..124a70eb66 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -2300,8 +2300,8 @@ def __init__(
         display_name: str,
         optimization_prediction_type: str,
         optimization_objective: Optional[str] = None,
-        column_transformations: Optional[Union[Dict, List[Dict]]] = None,
         column_specs: Optional[Dict[str, str]] = None,
+        column_transformations: Optional[Union[Dict, List[Dict]]] = None,
         optimization_objective_recall_value: Optional[float] = None,
         optimization_objective_precision_value: Optional[float] = None,
         project: Optional[str] = None,
@@ -2352,7 +2352,7 @@ def __init__(
                 "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
                 "minimize-mae" - Minimize mean-absolute error (MAE).
                 "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
-            column_transformations (Optional[Union[Dict, List[Dict]]]):
+            column_specs (Optional[Dict[str, str]]):
                 Optional. Transformations to apply to the input columns (i.e. columns other
                 than the targetColumn). Each transformation may produce multiple
                 result values from the column's value, and all are used for training.
@@ -2361,8 +2361,7 @@ def __init__(
                 If an input column has no transformations on it, such a column is
                 ignored by the training, except for the targetColumn, which should have
                 no transformations defined on.
-                Only one of column_transformations or column_specs should be passed.
-            column_specs (Optional[Dict[str, str]]):
+            column_transformations (Optional[Union[Dict, List[Dict]]]):
                 Optional. Transformations to apply to the input columns (i.e. columns other
                 than the targetColumn). Each transformation may produce multiple
                 result values from the column's value, and all are used for training.
@@ -2372,6 +2371,7 @@ def __init__(
                 ignored by the training, except for the targetColumn, which should have
                 no transformations defined on.
                 Only one of column_transformations or column_specs should be passed.
+                Only one of column_transformations or column_specs should be passed.
             optimization_objective_recall_value (float):
                 Optional. Required when maximize-precision-at-recall optimizationObjective was
                 picked, represents the recall value at which the optimization is done.
@@ -2655,12 +2655,6 @@ def _run(
             )
         if self._column_transformations is not None:
             column_transformations = self._column_transformations
-        if self._column_specs is not None and column_transformations is None:
-            column_transformations = [
-                {self._column_specs[column]: {"column_name": column}}
-                for column in self._column_specs
-            ]
-        if column_transformations is not None:
             column_names = dataset.column_names
             for transformation in column_transformations:
                 for data_type in transformation:
@@ -2669,6 +2663,11 @@ def _run(
                         raise ValueError(f"'{column}' is not in the dataset.")
                     if column["column_name"] is target_column:
                         raise ValueError("Target column is in transformations.")
+        elif self._column_specs is not None:
+            column_transformations = [
+                {self._column_specs[column]: {"column_name": column}}
+                for column in self._column_specs
+            ]
         # auto-populate transformations
         if column_transformations is None:
             _LOGGER.info(
@@ -2743,7 +2742,6 @@ def _add_additional_experiments(self, additional_experiments: List[str]):
         """
         self._additional_experiments.extend(additional_experiments)
 
-    @classmethod
     def get_auto_column_specs(
         self, dataset: datasets.TabularDataset, target_column: str,
     ) -> Dict[str, str]:
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index d7016282c3..5ced76a3ae 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -242,6 +242,7 @@ def test_run_call_pipeline_service_create(
         column_specs[
             _TEST_TRAINING_COLUMN_NAMES[0]
         ] = aiplatform.column.data_types.NUMERIC
+        assert column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] is "numeric"
         column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] = aiplatform.column.data_types.AUTO
 
         job = training_jobs.AutoMLTabularTrainingJob(

From 5fef60b1b75188c73250379930d4f3cba0a9db9a Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Sat, 19 Jun 2021 20:47:04 -0400
Subject: [PATCH 12/22] remove self arg

---
 google/cloud/aiplatform/training_jobs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 514b740f6b..72ad39be95 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -2971,7 +2971,7 @@ def _add_additional_experiments(self, additional_experiments: List[str]):
         self._additional_experiments.extend(additional_experiments)
 
     def get_auto_column_specs(
-        self, dataset: datasets.TabularDataset, target_column: str,
+        dataset: datasets.TabularDataset, target_column: str,
     ) -> Dict[str, str]:
         """Returns a dict with all non-target columns as keys and 'auto' as values.
         Args:

From 23ac4c21534ad9e8f2f3a671baaf02add540fb08 Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Sun, 20 Jun 2021 14:34:26 -0400
Subject: [PATCH 13/22] lint

---
 google/cloud/aiplatform/training_jobs.py      | 27 ++++++----
 .../test_automl_tabular_training_jobs.py      | 50 ++++++++++++-------
 2 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 72ad39be95..339c69c6ef 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -697,7 +697,9 @@ def _get_model(self) -> Optional[models.Model]:
             )
 
             return models.Model(
-                fields.id, project=fields.project, location=fields.location,
+                fields.id,
+                project=fields.project,
+                location=fields.location,
             )
 
     def _wait_callback(self):
@@ -1161,12 +1163,14 @@ def _prepare_and_validate_run(
             model_display_name = model_display_name or self._display_name + "-model"
 
         # validates args and will raise
-        worker_pool_specs = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool(
-            replica_count=replica_count,
-            machine_type=machine_type,
-            accelerator_count=accelerator_count,
-            accelerator_type=accelerator_type,
-        ).pool_specs
+        worker_pool_specs = (
+            worker_spec_utils._DistributedTrainingSpec.chief_worker_pool(
+                replica_count=replica_count,
+                machine_type=machine_type,
+                accelerator_count=accelerator_count,
+                accelerator_type=accelerator_type,
+            ).pool_specs
+        )
 
         managed_model = self._managed_model
         if model_display_name:
@@ -2971,7 +2975,8 @@ def _add_additional_experiments(self, additional_experiments: List[str]):
         self._additional_experiments.extend(additional_experiments)
 
     def get_auto_column_specs(
-        dataset: datasets.TabularDataset, target_column: str,
+        dataset: datasets.TabularDataset,
+        target_column: str,
     ) -> Dict[str, str]:
         """Returns a dict with all non-target columns as keys and 'auto' as values.
         Args:
@@ -4791,8 +4796,10 @@ def __init__(
                 schema.training_job.definition.automl_text_classification
             )
 
-            training_task_inputs_dict = training_job_inputs.AutoMlTextClassificationInputs(
-                multi_label=multi_label
+            training_task_inputs_dict = (
+                training_job_inputs.AutoMlTextClassificationInputs(
+                    multi_label=multi_label
+                )
             )
         elif prediction_type == "extraction":
             training_task_definition = (
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 5ced76a3ae..9433bdeae7 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -78,7 +78,8 @@
     "optimizationObjectivePrecisionValue": None,
 }
 _TEST_TRAINING_TASK_INPUTS = json_format.ParseDict(
-    _TEST_TRAINING_TASK_INPUTS_DICT, struct_pb2.Value(),
+    _TEST_TRAINING_TASK_INPUTS_DICT,
+    struct_pb2.Value(),
 )
 _TEST_TRAINING_TASK_INPUTS_WITH_ADDITIONAL_EXPERIMENTS = json_format.ParseDict(
     {
@@ -126,10 +127,12 @@ def mock_pipeline_service_create():
     with mock.patch.object(
         pipeline_service_client.PipelineServiceClient, "create_training_pipeline"
     ) as mock_create_training_pipeline:
-        mock_create_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
-            name=_TEST_PIPELINE_RESOURCE_NAME,
-            state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED,
-            model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME),
+        mock_create_training_pipeline.return_value = (
+            gca_training_pipeline.TrainingPipeline(
+                name=_TEST_PIPELINE_RESOURCE_NAME,
+                state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED,
+                model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME),
+            )
         )
         yield mock_create_training_pipeline
 
@@ -139,10 +142,12 @@ def mock_pipeline_service_get():
     with mock.patch.object(
         pipeline_service_client.PipelineServiceClient, "get_training_pipeline"
     ) as mock_get_training_pipeline:
-        mock_get_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
-            name=_TEST_PIPELINE_RESOURCE_NAME,
-            state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED,
-            model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME),
+        mock_get_training_pipeline.return_value = (
+            gca_training_pipeline.TrainingPipeline(
+                name=_TEST_PIPELINE_RESOURCE_NAME,
+                state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED,
+                model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME),
+            )
         )
         yield mock_get_training_pipeline
 
@@ -152,17 +157,21 @@ def mock_pipeline_service_create_and_get_with_fail():
     with mock.patch.object(
         pipeline_service_client.PipelineServiceClient, "create_training_pipeline"
     ) as mock_create_training_pipeline:
-        mock_create_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
-            name=_TEST_PIPELINE_RESOURCE_NAME,
-            state=gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING,
+        mock_create_training_pipeline.return_value = (
+            gca_training_pipeline.TrainingPipeline(
+                name=_TEST_PIPELINE_RESOURCE_NAME,
+                state=gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING,
+            )
         )
 
         with mock.patch.object(
             pipeline_service_client.PipelineServiceClient, "get_training_pipeline"
         ) as mock_get_training_pipeline:
-            mock_get_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
-                name=_TEST_PIPELINE_RESOURCE_NAME,
-                state=gca_pipeline_state.PipelineState.PIPELINE_STATE_FAILED,
+            mock_get_training_pipeline.return_value = (
+                gca_training_pipeline.TrainingPipeline(
+                    name=_TEST_PIPELINE_RESOURCE_NAME,
+                    state=gca_pipeline_state.PipelineState.PIPELINE_STATE_FAILED,
+                )
             )
 
             yield mock_create_training_pipeline, mock_get_training_pipeline
@@ -235,14 +244,15 @@ def test_run_call_pipeline_service_create(
         )
 
         column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
-            dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN,
+            dataset=mock_dataset_tabular,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
         )
 
         assert column_specs == _TEST_TRAINING_COLUMN_SPECS
         column_specs[
             _TEST_TRAINING_COLUMN_NAMES[0]
         ] = aiplatform.column.data_types.NUMERIC
-        assert column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] is "numeric"
+        assert column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] == "numeric"
         column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] = aiplatform.column.data_types.AUTO
 
         job = training_jobs.AutoMLTabularTrainingJob(
@@ -328,7 +338,8 @@ def test_run_call_pipeline_if_no_model_display_name(
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
 
         column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
-            dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN,
+            dataset=mock_dataset_tabular,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
         )
 
         assert column_specs == _TEST_TRAINING_COLUMN_SPECS
@@ -370,7 +381,8 @@ def test_run_call_pipeline_if_no_model_display_name(
         )
 
         true_input_data_config = gca_training_pipeline.InputDataConfig(
-            fraction_split=true_fraction_split, dataset_id=mock_dataset_tabular.name,
+            fraction_split=true_fraction_split,
+            dataset_id=mock_dataset_tabular.name,
         )
 
         true_training_pipeline = gca_training_pipeline.TrainingPipeline(

From d5af1e65342e93ad1fc8012413cc68a5abbdcb33 Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Thu, 8 Jul 2021 22:42:07 -0400
Subject: [PATCH 14/22] address feedback

---
 google/cloud/aiplatform/__init__.py           |  4 +-
 .../cloud/aiplatform/{column.py => automl.py} | 19 ++--
 google/cloud/aiplatform/training_jobs.py      | 93 ++++++++++---------
 .../test_automl_tabular_training_jobs.py      | 54 +++++------
 4 files changed, 82 insertions(+), 88 deletions(-)
 rename google/cloud/aiplatform/{column.py => automl.py} (66%)

diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py
index 78fb4dadb6..1fd1b0e8f3 100644
--- a/google/cloud/aiplatform/__init__.py
+++ b/google/cloud/aiplatform/__init__.py
@@ -46,7 +46,7 @@
     AutoMLTextTrainingJob,
     AutoMLVideoTrainingJob,
 )
-from google.cloud.aiplatform.column import data_types
+from google.cloud.aiplatform.automl import column_data_types
 
 """
 Usage:
@@ -92,5 +92,5 @@
     "TextDataset",
     "TimeSeriesDataset",
     "VideoDataset",
-    "data_types",
+    "column_data_types",
 )
diff --git a/google/cloud/aiplatform/column.py b/google/cloud/aiplatform/automl.py
similarity index 66%
rename from google/cloud/aiplatform/column.py
rename to google/cloud/aiplatform/automl.py
index 159b3cf415..59e4675a5a 100644
--- a/google/cloud/aiplatform/column.py
+++ b/google/cloud/aiplatform/automl.py
@@ -18,12 +18,13 @@
 """Vertex AI tabular data types"""
 
 
-class data_types:
-    AUTO = "auto"
-    NUMERIC = "numeric"
-    CATEGORICAL = "categorical"
-    TIMESTAMP = "timestamp"
-    TEXT = "text"
-    REPEATED_NUMERIC = "repeated_numeric"
-    REPEATED_CATEGORICAL = "repeated_categorical"
-    REPEATED_TEXT = "repeated_text"
+class tabular:
+    class column_data_types:
+        AUTO = "auto"
+        NUMERIC = "numeric"
+        CATEGORICAL = "categorical"
+        TIMESTAMP = "timestamp"
+        TEXT = "text"
+        REPEATED_NUMERIC = "repeated_numeric"
+        REPEATED_CATEGORICAL = "repeated_categorical"
+        REPEATED_TEXT = "repeated_text"
diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 339c69c6ef..c324f8af9e 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -697,9 +697,7 @@ def _get_model(self) -> Optional[models.Model]:
             )
 
             return models.Model(
-                fields.id,
-                project=fields.project,
-                location=fields.location,
+                fields.id, project=fields.project, location=fields.location,
             )
 
     def _wait_callback(self):
@@ -1163,14 +1161,12 @@ def _prepare_and_validate_run(
             model_display_name = model_display_name or self._display_name + "-model"
 
         # validates args and will raise
-        worker_pool_specs = (
-            worker_spec_utils._DistributedTrainingSpec.chief_worker_pool(
-                replica_count=replica_count,
-                machine_type=machine_type,
-                accelerator_count=accelerator_count,
-                accelerator_type=accelerator_type,
-            ).pool_specs
-        )
+        worker_pool_specs = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool(
+            replica_count=replica_count,
+            machine_type=machine_type,
+            accelerator_count=accelerator_count,
+            accelerator_type=accelerator_type,
+        ).pool_specs
 
         managed_model = self._managed_model
         if model_display_name:
@@ -2540,6 +2536,15 @@ def __init__(
     ):
         """Constructs a AutoML Tabular Training Job.
 
+        Example usage:
+
+        job = training_jobs.AutoMLTabularTrainingJob(
+            display_name="my_display_name",
+            optimization_prediction_type="classification",
+            optimization_objective="minimize-log-loss",
+            column_specs=my_column_specs,
+        )
+
         Args:
             display_name (str):
                 Required. The user-defined name of this TrainingPipeline.
@@ -2580,7 +2585,7 @@ def __init__(
                 "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
                 "minimize-mae" - Minimize mean-absolute error (MAE).
                 "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
-            column_specs (Optional[Dict[str, str]]):
+            column_specs (Dict[str, str]):
                 Optional. Transformations to apply to the input columns (i.e. columns other
                 than the targetColumn). Each transformation may produce multiple
                 result values from the column's value, and all are used for training.
@@ -2589,7 +2594,8 @@ def __init__(
                 If an input column has no transformations on it, such a column is
                 ignored by the training, except for the targetColumn, which should have
                 no transformations defined on.
-            column_transformations (Optional[Union[Dict, List[Dict]]]):
+                Only one of column_transformations or column_specs should be passed.
+            column_transformations (Union[Dict, List[Dict]]):
                 Optional. Transformations to apply to the input columns (i.e. columns other
                 than the targetColumn). Each transformation may produce multiple
                 result values from the column's value, and all are used for training.
@@ -2599,7 +2605,6 @@ def __init__(
                 ignored by the training, except for the targetColumn, which should have
                 no transformations defined on.
                 Only one of column_transformations or column_specs should be passed.
-                Only one of column_transformations or column_specs should be passed.
             optimization_objective_recall_value (float):
                 Optional. Required when maximize-precision-at-recall optimizationObjective was
                 picked, represents the recall value at which the optimization is done.
@@ -2652,8 +2657,16 @@ def __init__(
             training_encryption_spec_key_name=training_encryption_spec_key_name,
             model_encryption_spec_key_name=model_encryption_spec_key_name,
         )
-        self._column_transformations = column_transformations
-        self._column_specs = column_specs
+        # user populated transformations
+        if self._column_transformations is not None and self._column_specs is not None:
+            _LOGGER.info(
+                "column_transformations and column_specs were both passed. column_transformations was used."
+            )
+        if column_transformations is not None:
+            self._column_transformations = column_transformations
+            self._column_specs = None
+        elif column_specs is not None:
+            self._column_specs = column_specs
         self._optimization_objective = optimization_objective
         self._optimization_prediction_type = optimization_prediction_type
         self._optimization_objective_recall_value = optimization_objective_recall_value
@@ -2880,28 +2893,13 @@ def _run(
         training_task_definition = schema.training_job.definition.automl_tabular
         column_transformations = None
 
-        # user populated transformations
-        if self._column_transformations is not None and self._column_specs is not None:
-            _LOGGER.info(
-                "column_transformations and column_specs were both passed. column_transformations was used."
-            )
-        if self._column_transformations is not None:
-            column_transformations = self._column_transformations
-            column_names = dataset.column_names
-            for transformation in column_transformations:
-                for data_type in transformation:
-                    column = transformation[data_type]
-                    if column["column_name"] not in column_names:
-                        raise ValueError(f"'{column}' is not in the dataset.")
-                    if column["column_name"] is target_column:
-                        raise ValueError("Target column is in transformations.")
-        elif self._column_specs is not None:
-            column_transformations = [
-                {self._column_specs[column]: {"column_name": column}}
-                for column in self._column_specs
+        # convert column specs to column transformations
+        if self._column_specs is not None:
+            self._column_transformations = [
+                {item[1]: {"column_name": item[0]}} for item in self._column_specs.items
             ]
         # auto-populate transformations
-        if column_transformations is None:
+        if self._column_transformations is None:
             _LOGGER.info(
                 "No column transformations provided, so now retrieving columns from dataset in order to set default column transformations."
             )
@@ -2911,7 +2909,7 @@ def _run(
                 for column_name in dataset.column_names
                 if column_name != target_column
             ]
-            column_transformations = [
+            self._column_transformations = [
                 {"auto": {"column_name": column_name}} for column_name in column_names
             ]
 
@@ -2923,7 +2921,7 @@ def _run(
         training_task_inputs_dict = {
             # required inputs
             "targetColumn": target_column,
-            "transformations": column_transformations,
+            "transformations": self._column_transformations,
             "trainBudgetMilliNodeHours": budget_milli_node_hours,
             # optional inputs
             "weightColumnName": weight_column,
@@ -2974,11 +2972,19 @@ def _add_additional_experiments(self, additional_experiments: List[str]):
         """
         self._additional_experiments.extend(additional_experiments)
 
+    @staticmethod
     def get_auto_column_specs(
-        dataset: datasets.TabularDataset,
-        target_column: str,
+        dataset: datasets.TabularDataset, target_column: str,
     ) -> Dict[str, str]:
         """Returns a dict with all non-target columns as keys and 'auto' as values.
+        
+        Example usage:
+
+        column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
+            dataset=my_dataset,
+            target_column="my_target_column",
+        )
+
         Args:
             dataset (datasets.TabularDataset):
                 Required. Intended dataset.
@@ -3001,7 +3007,6 @@ def get_auto_column_specs(
         return column_specs
 
 
-# TODO: add tabular sugar to forecasting
 class AutoMLForecastingTrainingJob(_TrainingJob):
     _supported_training_schemas = (schema.training_job.definition.automl_forecasting,)
 
@@ -4796,10 +4801,8 @@ def __init__(
                 schema.training_job.definition.automl_text_classification
             )
 
-            training_task_inputs_dict = (
-                training_job_inputs.AutoMlTextClassificationInputs(
-                    multi_label=multi_label
-                )
+            training_task_inputs_dict = training_job_inputs.AutoMlTextClassificationInputs(
+                multi_label=multi_label
             )
         elif prediction_type == "extraction":
             training_task_definition = (
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 9433bdeae7..7809bbeefb 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -78,8 +78,7 @@
     "optimizationObjectivePrecisionValue": None,
 }
 _TEST_TRAINING_TASK_INPUTS = json_format.ParseDict(
-    _TEST_TRAINING_TASK_INPUTS_DICT,
-    struct_pb2.Value(),
+    _TEST_TRAINING_TASK_INPUTS_DICT, struct_pb2.Value(),
 )
 _TEST_TRAINING_TASK_INPUTS_WITH_ADDITIONAL_EXPERIMENTS = json_format.ParseDict(
     {
@@ -127,12 +126,10 @@ def mock_pipeline_service_create():
     with mock.patch.object(
         pipeline_service_client.PipelineServiceClient, "create_training_pipeline"
     ) as mock_create_training_pipeline:
-        mock_create_training_pipeline.return_value = (
-            gca_training_pipeline.TrainingPipeline(
-                name=_TEST_PIPELINE_RESOURCE_NAME,
-                state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED,
-                model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME),
-            )
+        mock_create_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
+            name=_TEST_PIPELINE_RESOURCE_NAME,
+            state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED,
+            model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME),
         )
         yield mock_create_training_pipeline
 
@@ -142,12 +139,10 @@ def mock_pipeline_service_get():
     with mock.patch.object(
         pipeline_service_client.PipelineServiceClient, "get_training_pipeline"
     ) as mock_get_training_pipeline:
-        mock_get_training_pipeline.return_value = (
-            gca_training_pipeline.TrainingPipeline(
-                name=_TEST_PIPELINE_RESOURCE_NAME,
-                state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED,
-                model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME),
-            )
+        mock_get_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
+            name=_TEST_PIPELINE_RESOURCE_NAME,
+            state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED,
+            model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME),
         )
         yield mock_get_training_pipeline
 
@@ -157,21 +152,17 @@ def mock_pipeline_service_create_and_get_with_fail():
     with mock.patch.object(
         pipeline_service_client.PipelineServiceClient, "create_training_pipeline"
     ) as mock_create_training_pipeline:
-        mock_create_training_pipeline.return_value = (
-            gca_training_pipeline.TrainingPipeline(
-                name=_TEST_PIPELINE_RESOURCE_NAME,
-                state=gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING,
-            )
+        mock_create_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
+            name=_TEST_PIPELINE_RESOURCE_NAME,
+            state=gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING,
         )
 
         with mock.patch.object(
             pipeline_service_client.PipelineServiceClient, "get_training_pipeline"
         ) as mock_get_training_pipeline:
-            mock_get_training_pipeline.return_value = (
-                gca_training_pipeline.TrainingPipeline(
-                    name=_TEST_PIPELINE_RESOURCE_NAME,
-                    state=gca_pipeline_state.PipelineState.PIPELINE_STATE_FAILED,
-                )
+            mock_get_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
+                name=_TEST_PIPELINE_RESOURCE_NAME,
+                state=gca_pipeline_state.PipelineState.PIPELINE_STATE_FAILED,
             )
 
             yield mock_create_training_pipeline, mock_get_training_pipeline
@@ -244,16 +235,17 @@ def test_run_call_pipeline_service_create(
         )
 
         column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
-            dataset=mock_dataset_tabular,
-            target_column=_TEST_TRAINING_TARGET_COLUMN,
+            dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN,
         )
 
         assert column_specs == _TEST_TRAINING_COLUMN_SPECS
         column_specs[
             _TEST_TRAINING_COLUMN_NAMES[0]
-        ] = aiplatform.column.data_types.NUMERIC
+        ] = aiplatform.automl.tabular.column_data_types.NUMERIC
         assert column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] == "numeric"
-        column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] = aiplatform.column.data_types.AUTO
+        column_specs[
+            _TEST_TRAINING_COLUMN_NAMES[0]
+        ] = aiplatform.tabular.column.data_types.AUTO
 
         job = training_jobs.AutoMLTabularTrainingJob(
             display_name=_TEST_DISPLAY_NAME,
@@ -338,8 +330,7 @@ def test_run_call_pipeline_if_no_model_display_name(
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
 
         column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
-            dataset=mock_dataset_tabular,
-            target_column=_TEST_TRAINING_TARGET_COLUMN,
+            dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN,
         )
 
         assert column_specs == _TEST_TRAINING_COLUMN_SPECS
@@ -381,8 +372,7 @@ def test_run_call_pipeline_if_no_model_display_name(
         )
 
         true_input_data_config = gca_training_pipeline.InputDataConfig(
-            fraction_split=true_fraction_split,
-            dataset_id=mock_dataset_tabular.name,
+            fraction_split=true_fraction_split, dataset_id=mock_dataset_tabular.name,
         )
 
         true_training_pipeline = gca_training_pipeline.TrainingPipeline(

From b9e21dcfb878f5af24681d326fda126444665b9e Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Fri, 9 Jul 2021 20:55:09 -0400
Subject: [PATCH 15/22] add tests

---
 google/cloud/aiplatform/__init__.py           |   2 -
 google/cloud/aiplatform/automl.py             |  30 --
 google/cloud/aiplatform/training_jobs.py      |  13 +-
 .../test_automl_tabular_training_jobs.py      | 329 ++++++++++++++++--
 4 files changed, 315 insertions(+), 59 deletions(-)
 delete mode 100644 google/cloud/aiplatform/automl.py

diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py
index 1fd1b0e8f3..ddf9c4e6e7 100644
--- a/google/cloud/aiplatform/__init__.py
+++ b/google/cloud/aiplatform/__init__.py
@@ -46,7 +46,6 @@
     AutoMLTextTrainingJob,
     AutoMLVideoTrainingJob,
 )
-from google.cloud.aiplatform.automl import column_data_types
 
 """
 Usage:
@@ -92,5 +91,4 @@
     "TextDataset",
     "TimeSeriesDataset",
     "VideoDataset",
-    "column_data_types",
 )
diff --git a/google/cloud/aiplatform/automl.py b/google/cloud/aiplatform/automl.py
deleted file mode 100644
index 59e4675a5a..0000000000
--- a/google/cloud/aiplatform/automl.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2021 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Vertex AI tabular data types"""
-
-
-class tabular:
-    class column_data_types:
-        AUTO = "auto"
-        NUMERIC = "numeric"
-        CATEGORICAL = "categorical"
-        TIMESTAMP = "timestamp"
-        TEXT = "text"
-        REPEATED_NUMERIC = "repeated_numeric"
-        REPEATED_CATEGORICAL = "repeated_categorical"
-        REPEATED_TEXT = "repeated_text"
diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index c324f8af9e..83926e4d7f 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -2891,7 +2891,6 @@ def _run(
         """
 
         training_task_definition = schema.training_job.definition.automl_tabular
-        column_transformations = None
 
         # convert column specs to column transformations
         if self._column_specs is not None:
@@ -2977,7 +2976,7 @@ def get_auto_column_specs(
         dataset: datasets.TabularDataset, target_column: str,
     ) -> Dict[str, str]:
         """Returns a dict with all non-target columns as keys and 'auto' as values.
-        
+
         Example usage:
 
         column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
@@ -3006,6 +3005,16 @@ def get_auto_column_specs(
         column_specs = {column: "auto" for column in column_names}
         return column_specs
 
+    class column_data_types:
+        AUTO = "auto"
+        NUMERIC = "numeric"
+        CATEGORICAL = "categorical"
+        TIMESTAMP = "timestamp"
+        TEXT = "text"
+        REPEATED_NUMERIC = "repeated_numeric"
+        REPEATED_CATEGORICAL = "repeated_categorical"
+        REPEATED_TEXT = "repeated_text"
+
 
 class AutoMLForecastingTrainingJob(_TrainingJob):
     _supported_training_schemas = (schema.training_job.definition.automl_forecasting,)
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 7809bbeefb..1ed9d183bc 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -45,6 +45,13 @@
     "target",
 ]
 
+_TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE = [
+    "apple",
+    "banana",
+    "coconut",
+    "target",
+]
+
 _TEST_TRAINING_COLUMN_TRANSFORMATIONS = [
     {"auto": {"column_name": "sepal_width"}},
     {"auto": {"column_name": "sepal_length"}},
@@ -52,11 +59,20 @@
     {"auto": {"column_name": "petal_width"}},
 ]
 _TEST_TRAINING_COLUMN_SPECS = {
-    "sepal_width": "auto",
-    "sepal_length": "auto",
-    "petal_width": "auto",
-    "petal_length": "auto",
+    "apple": "auto",
+    "banana": "auto",
+    "coconut": "auto",
 }
+_TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE = [
+    {"auto": {"column_name": "apple"}},
+    {"auto": {"column_name": "banana"}},
+    {"auto": {"column_name": "coconut"}},
+]
+_TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE_NOT_AUTO = [
+    {"numeric": {"column_name": "apple"}},
+    {"categorical": {"column_name": "banana"}},
+    {"text": {"column_name": "coconut"}},
+]
 _TEST_TRAINING_TARGET_COLUMN = "target"
 _TEST_TRAINING_BUDGET_MILLI_NODE_HOURS = 1000
 _TEST_TRAINING_WEIGHT_COLUMN = "weight"
@@ -87,6 +103,20 @@
     },
     struct_pb2.Value(),
 )
+_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE = json_format.ParseDict(
+    {
+        **_TEST_TRAINING_TASK_INPUTS_DICT,
+        "transformations": _TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE,
+    },
+    struct_pb2.Value(),
+)
+_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE_NOT_AUTO = json_format.ParseDict(
+    {
+        **_TEST_TRAINING_TASK_INPUTS_DICT,
+        "transformations": _TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE_NOT_AUTO,
+    },
+    struct_pb2.Value(),
+)
 
 _TEST_DATASET_NAME = "test-dataset-name"
 
@@ -195,6 +225,24 @@ def mock_dataset_tabular():
     yield ds
 
 
+@pytest.fixture
+def mock_dataset_tabular_alternative():
+    ds = mock.MagicMock(datasets.TabularDataset)
+    ds.name = _TEST_DATASET_NAME
+    ds._latest_future = None
+    ds._exception = None
+    ds._gca_resource = gca_dataset.Dataset(
+        display_name=_TEST_DATASET_DISPLAY_NAME,
+        metadata_schema_uri=_TEST_METADATA_SCHEMA_URI_TABULAR,
+        labels={},
+        name=_TEST_DATASET_NAME,
+        metadata={},
+    )
+    ds.column_names = _TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE
+
+    yield ds
+
+
 @pytest.fixture
 def mock_dataset_nontabular():
     ds = mock.MagicMock(datasets.ImageDataset)
@@ -234,24 +282,11 @@ def test_run_call_pipeline_service_create(
             encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
         )
 
-        column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
-            dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN,
-        )
-
-        assert column_specs == _TEST_TRAINING_COLUMN_SPECS
-        column_specs[
-            _TEST_TRAINING_COLUMN_NAMES[0]
-        ] = aiplatform.automl.tabular.column_data_types.NUMERIC
-        assert column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] == "numeric"
-        column_specs[
-            _TEST_TRAINING_COLUMN_NAMES[0]
-        ] = aiplatform.tabular.column.data_types.AUTO
-
         job = training_jobs.AutoMLTabularTrainingJob(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
-            column_specs=column_specs,
+            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
             optimization_objective_recall_value=None,
             optimization_objective_precision_value=None,
         )
@@ -329,17 +364,11 @@ def test_run_call_pipeline_if_no_model_display_name(
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
 
-        column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
-            dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN,
-        )
-
-        assert column_specs == _TEST_TRAINING_COLUMN_SPECS
-
         job = training_jobs.AutoMLTabularTrainingJob(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
-            column_specs=column_specs,
+            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
             optimization_objective_recall_value=None,
             optimization_objective_precision_value=None,
             training_encryption_spec_key_name=_TEST_PIPELINE_ENCRYPTION_KEY_NAME,
@@ -541,12 +570,262 @@ def test_run_call_pipeline_service_create_if_set_additional_experiments(
             training_pipeline=true_training_pipeline,
         )
 
+    @pytest.mark.parametrize("sync", [True, False])
+    def test_run_call_pipeline_service_create_with_column_specs(
+        self,
+        mock_pipeline_service_create,
+        mock_pipeline_service_get,
+        mock_dataset_tabular_alternative,
+        mock_model_service_get,
+        sync,
+    ):
+        aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
+
+        column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
+            dataset=mock_dataset_tabular_alternative,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
+        )
+
+        assert column_specs == _TEST_TRAINING_COLUMN_SPECS
+
+        job = training_jobs.AutoMLTabularTrainingJob(
+            display_name=_TEST_DISPLAY_NAME,
+            optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+            optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+            column_specs=column_specs,
+            optimization_objective_recall_value=None,
+            optimization_objective_precision_value=None,
+        )
+
+        model_from_job = job.run(
+            dataset=mock_dataset_tabular_alternative,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
+            model_display_name=_TEST_MODEL_DISPLAY_NAME,
+            training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction_split=_TEST_TEST_FRACTION_SPLIT,
+            predefined_split_column_name=_TEST_PREDEFINED_SPLIT_COLUMN_NAME,
+            weight_column=_TEST_TRAINING_WEIGHT_COLUMN,
+            budget_milli_node_hours=_TEST_TRAINING_BUDGET_MILLI_NODE_HOURS,
+            disable_early_stopping=_TEST_TRAINING_DISABLE_EARLY_STOPPING,
+            sync=sync,
+        )
+
+        if not sync:
+            model_from_job.wait()
+
+        true_fraction_split = gca_training_pipeline.FractionSplit(
+            training_fraction=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction=_TEST_TEST_FRACTION_SPLIT,
+        )
+
+        true_managed_model = gca_model.Model(
+            display_name=_TEST_MODEL_DISPLAY_NAME,
+            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
+        )
+
+        true_input_data_config = gca_training_pipeline.InputDataConfig(
+            fraction_split=true_fraction_split,
+            predefined_split=gca_training_pipeline.PredefinedSplit(
+                key=_TEST_PREDEFINED_SPLIT_COLUMN_NAME
+            ),
+            dataset_id=mock_dataset_tabular_alternative.name,
+        )
+
+        true_training_pipeline = gca_training_pipeline.TrainingPipeline(
+            display_name=_TEST_DISPLAY_NAME,
+            training_task_definition=schema.training_job.definition.automl_tabular,
+            training_task_inputs=_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE,
+            model_to_upload=true_managed_model,
+            input_data_config=true_input_data_config,
+            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
+        )
+
+        mock_pipeline_service_create.assert_called_once_with(
+            parent=initializer.global_config.common_location_path(),
+            training_pipeline=true_training_pipeline,
+        )
+
+    @pytest.mark.parametrize("sync", [True, False])
+    # Should default to column_transformation when column_specs also passed
+    def test_run_call_pipeline_service_create_with_column_specs_and_transformations(
+        self,
+        mock_pipeline_service_create,
+        mock_pipeline_service_get,
+        mock_dataset_tabular,
+        mock_dataset_alternative,
+        mock_model_service_get,
+        sync,
+    ):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            staging_bucket=_TEST_BUCKET_NAME,
+            encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
+        )
+
+        column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
+            dataset=mock_dataset_tabular_alternative,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
+        )
+
+        assert column_specs == _TEST_TRAINING_COLUMN_SPECS
+
+        job = training_jobs.AutoMLTabularTrainingJob(
+            display_name=_TEST_DISPLAY_NAME,
+            optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+            optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+            column_specs=column_specs,
+            optimization_objective_recall_value=None,
+            optimization_objective_precision_value=None,
+        )
+
+        model_from_job = job.run(
+            dataset=mock_dataset_tabular,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
+            model_display_name=_TEST_MODEL_DISPLAY_NAME,
+            training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction_split=_TEST_TEST_FRACTION_SPLIT,
+            predefined_split_column_name=_TEST_PREDEFINED_SPLIT_COLUMN_NAME,
+            weight_column=_TEST_TRAINING_WEIGHT_COLUMN,
+            budget_milli_node_hours=_TEST_TRAINING_BUDGET_MILLI_NODE_HOURS,
+            disable_early_stopping=_TEST_TRAINING_DISABLE_EARLY_STOPPING,
+            sync=sync,
+        )
+
+        if not sync:
+            model_from_job.wait()
+
+        true_fraction_split = gca_training_pipeline.FractionSplit(
+            training_fraction=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction=_TEST_TEST_FRACTION_SPLIT,
+        )
+
+        true_managed_model = gca_model.Model(
+            display_name=_TEST_MODEL_DISPLAY_NAME,
+            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
+        )
+
+        true_input_data_config = gca_training_pipeline.InputDataConfig(
+            fraction_split=true_fraction_split,
+            predefined_split=gca_training_pipeline.PredefinedSplit(
+                key=_TEST_PREDEFINED_SPLIT_COLUMN_NAME
+            ),
+            dataset_id=mock_dataset_tabular.name,
+        )
+
+        true_training_pipeline = gca_training_pipeline.TrainingPipeline(
+            display_name=_TEST_DISPLAY_NAME,
+            training_task_definition=schema.training_job.definition.automl_tabular,
+            training_task_inputs=_TEST_TRAINING_TASK_INPUTS,
+            model_to_upload=true_managed_model,
+            input_data_config=true_input_data_config,
+            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
+        )
+
+        mock_pipeline_service_create.assert_called_once_with(
+            parent=initializer.global_config.common_location_path(),
+            training_pipeline=true_training_pipeline,
+        )
+
+    @pytest.mark.parametrize("sync", [True, False])
+    def test_run_call_pipeline_service_create_with_column_specs_not_auto(
+        self,
+        mock_pipeline_service_create,
+        mock_pipeline_service_get,
+        mock_dataset_tabular_alternative,
+        mock_model_service_get,
+        sync,
+    ):
+        aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
+
+        column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
+            dataset=mock_dataset_tabular_alternative,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
+        )
+        column_specs[
+            _TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE[0]
+        ] = training_jobs.AutoMLTabularTrainingJob.column_data_types.NUMERIC
+        column_specs[
+            _TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE[1]
+        ] = training_jobs.AutoMLTabularTrainingJob.column_data_types.CATEGORICAL
+        column_specs[
+            _TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE[2]
+        ] = training_jobs.AutoMLTabularTrainingJob.column_data_types.TEXT
+
+        assert (
+            column_specs == _TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE_NOT_AUTO
+        )
+
+        job = training_jobs.AutoMLTabularTrainingJob(
+            display_name=_TEST_DISPLAY_NAME,
+            optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+            optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+            column_specs=column_specs,
+            optimization_objective_recall_value=None,
+            optimization_objective_precision_value=None,
+        )
+
+        model_from_job = job.run(
+            dataset=mock_dataset_tabular_alternative,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
+            model_display_name=_TEST_MODEL_DISPLAY_NAME,
+            training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction_split=_TEST_TEST_FRACTION_SPLIT,
+            predefined_split_column_name=_TEST_PREDEFINED_SPLIT_COLUMN_NAME,
+            weight_column=_TEST_TRAINING_WEIGHT_COLUMN,
+            budget_milli_node_hours=_TEST_TRAINING_BUDGET_MILLI_NODE_HOURS,
+            disable_early_stopping=_TEST_TRAINING_DISABLE_EARLY_STOPPING,
+            sync=sync,
+        )
+
+        if not sync:
+            model_from_job.wait()
+
+        true_fraction_split = gca_training_pipeline.FractionSplit(
+            training_fraction=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction=_TEST_TEST_FRACTION_SPLIT,
+        )
+
+        true_managed_model = gca_model.Model(
+            display_name=_TEST_MODEL_DISPLAY_NAME,
+            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
+        )
+
+        true_input_data_config = gca_training_pipeline.InputDataConfig(
+            fraction_split=true_fraction_split,
+            predefined_split=gca_training_pipeline.PredefinedSplit(
+                key=_TEST_PREDEFINED_SPLIT_COLUMN_NAME
+            ),
+            dataset_id=mock_dataset_tabular_alternative.name,
+        )
+
+        true_training_pipeline = gca_training_pipeline.TrainingPipeline(
+            display_name=_TEST_DISPLAY_NAME,
+            training_task_definition=schema.training_job.definition.automl_tabular,
+            training_task_inputs=_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE_NOT_AUTO,
+            model_to_upload=true_managed_model,
+            input_data_config=true_input_data_config,
+            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
+        )
+
+        mock_pipeline_service_create.assert_called_once_with(
+            parent=initializer.global_config.common_location_path(),
+            training_pipeline=true_training_pipeline,
+        )
+
     @pytest.mark.usefixtures(
         "mock_pipeline_service_create",
         "mock_pipeline_service_get",
         "mock_model_service_get",
     )
     @pytest.mark.parametrize("sync", [True, False])
+    # Also acts as a custom column_transformations test as it should not error during first call
     def test_run_called_twice_raises(self, mock_dataset_tabular, sync):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
 

From 24d12aa958c3c64ebbf83608ea579dd3c8e662ba Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Fri, 9 Jul 2021 21:28:06 -0400
Subject: [PATCH 16/22] bugs

---
 google/cloud/aiplatform/training_jobs.py                   | 2 +-
 tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 6 +-----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 83926e4d7f..851b2a2b47 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -2658,7 +2658,7 @@ def __init__(
             model_encryption_spec_key_name=model_encryption_spec_key_name,
         )
         # user populated transformations
-        if self._column_transformations is not None and self._column_specs is not None:
+        if column_transformations is not None and column_specs is not None:
             _LOGGER.info(
                 "column_transformations and column_specs were both passed. column_transformations was used."
             )
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 1ed9d183bc..d15b64075c 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -654,7 +654,7 @@ def test_run_call_pipeline_service_create_with_column_specs_and_transformations(
         mock_pipeline_service_create,
         mock_pipeline_service_get,
         mock_dataset_tabular,
-        mock_dataset_alternative,
+        mock_dataset_tabular_alternative,
         mock_model_service_get,
         sync,
     ):
@@ -756,10 +756,6 @@ def test_run_call_pipeline_service_create_with_column_specs_not_auto(
             _TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE[2]
         ] = training_jobs.AutoMLTabularTrainingJob.column_data_types.TEXT
 
-        assert (
-            column_specs == _TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE_NOT_AUTO
-        )
-
         job = training_jobs.AutoMLTabularTrainingJob(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,

From d26513486702c544e336e76be1a93516900300a2 Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Fri, 9 Jul 2021 22:06:21 -0400
Subject: [PATCH 17/22] more bugs

---
 google/cloud/aiplatform/training_jobs.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 851b2a2b47..672715ccf5 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -2667,6 +2667,8 @@ def __init__(
             self._column_specs = None
         elif column_specs is not None:
             self._column_specs = column_specs
+        else:
+            self._column_specs = None
         self._optimization_objective = optimization_objective
         self._optimization_prediction_type = optimization_prediction_type
         self._optimization_objective_recall_value = optimization_objective_recall_value
@@ -2895,7 +2897,8 @@ def _run(
         # convert column specs to column transformations
         if self._column_specs is not None:
             self._column_transformations = [
-                {item[1]: {"column_name": item[0]}} for item in self._column_specs.items
+                {item[1]: {"column_name": item[0]}}
+                for item in self._column_specs.items()
             ]
         # auto-populate transformations
         if self._column_transformations is None:

From 6c2d903751dbf817469a7571ba70172c6302c720 Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Sat, 10 Jul 2021 00:29:18 -0400
Subject: [PATCH 18/22] fix tests

---
 .../cloud/aiplatform/datasets/time_series_dataset.py |  1 -
 google/cloud/aiplatform/training_jobs.py             |  7 +++----
 .../aiplatform/test_automl_tabular_training_jobs.py  | 12 ++----------
 3 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/google/cloud/aiplatform/datasets/time_series_dataset.py b/google/cloud/aiplatform/datasets/time_series_dataset.py
index 8070daf10f..1a5d62bb39 100644
--- a/google/cloud/aiplatform/datasets/time_series_dataset.py
+++ b/google/cloud/aiplatform/datasets/time_series_dataset.py
@@ -26,7 +26,6 @@
 from google.cloud.aiplatform import utils
 
 
-# TODO: extend tabular dataset
 class TimeSeriesDataset(datasets._Dataset):
     """Managed time series dataset resource for Vertex AI"""
 
diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 672715ccf5..fde1e829fc 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -2663,12 +2663,14 @@ def __init__(
                 "column_transformations and column_specs were both passed. column_transformations was used."
             )
         if column_transformations is not None:
-            self._column_transformations = column_transformations
             self._column_specs = None
+            self._column_transformations = column_transformations
         elif column_specs is not None:
             self._column_specs = column_specs
+            self._column_transformations = None
         else:
             self._column_specs = None
+            self._column_transformations = None
         self._optimization_objective = optimization_objective
         self._optimization_prediction_type = optimization_prediction_type
         self._optimization_objective_recall_value = optimization_objective_recall_value
@@ -2887,9 +2889,6 @@ def _run(
         Returns:
             model: The trained Vertex AI Model resource or None if training did not
                 produce an Vertex AI Model.
-        Raises:
-            ValueError: When column doesn't exist in dataset.
-            ValueError: When target column is in transformations.
         """
 
         training_task_definition = schema.training_job.definition.automl_tabular
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index d15b64075c..0fd1f57101 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -620,10 +620,7 @@ def test_run_call_pipeline_service_create_with_column_specs(
             test_fraction=_TEST_TEST_FRACTION_SPLIT,
         )
 
-        true_managed_model = gca_model.Model(
-            display_name=_TEST_MODEL_DISPLAY_NAME,
-            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
-        )
+        true_managed_model = gca_model.Model(display_name=_TEST_MODEL_DISPLAY_NAME)
 
         true_input_data_config = gca_training_pipeline.InputDataConfig(
             fraction_split=true_fraction_split,
@@ -639,7 +636,6 @@ def test_run_call_pipeline_service_create_with_column_specs(
             training_task_inputs=_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE,
             model_to_upload=true_managed_model,
             input_data_config=true_input_data_config,
-            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
         )
 
         mock_pipeline_service_create.assert_called_once_with(
@@ -788,10 +784,7 @@ def test_run_call_pipeline_service_create_with_column_specs_not_auto(
             test_fraction=_TEST_TEST_FRACTION_SPLIT,
         )
 
-        true_managed_model = gca_model.Model(
-            display_name=_TEST_MODEL_DISPLAY_NAME,
-            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
-        )
+        true_managed_model = gca_model.Model(display_name=_TEST_MODEL_DISPLAY_NAME)
 
         true_input_data_config = gca_training_pipeline.InputDataConfig(
             fraction_split=true_fraction_split,
@@ -807,7 +800,6 @@ def test_run_call_pipeline_service_create_with_column_specs_not_auto(
             training_task_inputs=_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE_NOT_AUTO,
             model_to_upload=true_managed_model,
             input_data_config=true_input_data_config,
-            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
         )
 
         mock_pipeline_service_create.assert_called_once_with(

From dd2f831f830be8be354fa504b7fa1e5da85a61a3 Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Wed, 14 Jul 2021 15:56:23 -0400
Subject: [PATCH 19/22] address feedback

---
 google/cloud/aiplatform/training_jobs.py      | 29 ++++----
 .../test_automl_tabular_training_jobs.py      | 69 +++----------------
 2 files changed, 24 insertions(+), 74 deletions(-)

diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index fde1e829fc..c9642f1dc8 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -2542,7 +2542,7 @@ def __init__(
             display_name="my_display_name",
             optimization_prediction_type="classification",
             optimization_objective="minimize-log-loss",
-            column_specs=my_column_specs,
+            column_specs={"column_1": "auto", "column_2": "numeric"},
         )
 
         Args:
@@ -2586,11 +2586,12 @@ def __init__(
                 "minimize-mae" - Minimize mean-absolute error (MAE).
                 "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
             column_specs (Dict[str, str]):
-                Optional. Transformations to apply to the input columns (i.e. columns other
-                than the targetColumn). Each transformation may produce multiple
-                result values from the column's value, and all are used for training.
+                Optional. Alternative to column_transformations where the keys of the dict
+                are column names and their respective values are one of
+                AutoMLTabularTrainingJob.column_data_types.
                 When creating transformation for BigQuery Struct column, the column
-                should be flattened using "." as the delimiter.
+                should be flattened using "." as the delimiter. Only columns with no child
+                should have a transformation.
                 If an input column has no transformations on it, such a column is
                 ignored by the training, except for the targetColumn, which should have
                 no transformations defined on.
@@ -2600,7 +2601,8 @@ def __init__(
                 than the targetColumn). Each transformation may produce multiple
                 result values from the column's value, and all are used for training.
                 When creating transformation for BigQuery Struct column, the column
-                should be flattened using "." as the delimiter.
+                should be flattened using "." as the delimiter. Only columns with no child
+                should have a transformation.
                 If an input column has no transformations on it, such a column is
                 ignored by the training, except for the targetColumn, which should have
                 no transformations defined on.
@@ -2648,6 +2650,9 @@ def __init__(
                 If set, the trained Model will be secured by this key.
 
                 Overrides encryption_spec_key_name set in aiplatform.init.
+
+        Raises:
+            ValueError: When both column_transformations and column_specs were passed.
         """
         super().__init__(
             display_name=display_name,
@@ -2659,8 +2664,8 @@ def __init__(
         )
         # user populated transformations
         if column_transformations is not None and column_specs is not None:
-            _LOGGER.info(
-                "column_transformations and column_specs were both passed. column_transformations was used."
+            raise ValueError(
+                "Both column_transformations and column_specs were passed. Only one is allowed."
             )
         if column_transformations is not None:
             self._column_specs = None
@@ -2888,7 +2893,7 @@ def _run(
 
         Returns:
             model: The trained Vertex AI Model resource or None if training did not
-                produce an Vertex AI Model.
+                produce a Vertex AI Model.
         """
 
         training_task_definition = schema.training_job.definition.automl_tabular
@@ -2994,13 +2999,7 @@ def get_auto_column_specs(
         Returns:
             Dict[str, str]
                 Column names as keys and 'auto' as values
-
-        Raises:
-            RuntimeError: When no valid source is found.
-            ValueError: When target_column is not in dataset
         """
-        if target_column not in dataset.column_names:
-            raise ValueError("Target column not in dataset.")
         column_names = [
             column for column in dataset.column_names if column != target_column
         ]
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index 0fd1f57101..df3080522e 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -667,65 +667,16 @@ def test_run_call_pipeline_service_create_with_column_specs_and_transformations(
 
         assert column_specs == _TEST_TRAINING_COLUMN_SPECS
 
-        job = training_jobs.AutoMLTabularTrainingJob(
-            display_name=_TEST_DISPLAY_NAME,
-            optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
-            optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
-            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
-            column_specs=column_specs,
-            optimization_objective_recall_value=None,
-            optimization_objective_precision_value=None,
-        )
-
-        model_from_job = job.run(
-            dataset=mock_dataset_tabular,
-            target_column=_TEST_TRAINING_TARGET_COLUMN,
-            model_display_name=_TEST_MODEL_DISPLAY_NAME,
-            training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT,
-            validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT,
-            test_fraction_split=_TEST_TEST_FRACTION_SPLIT,
-            predefined_split_column_name=_TEST_PREDEFINED_SPLIT_COLUMN_NAME,
-            weight_column=_TEST_TRAINING_WEIGHT_COLUMN,
-            budget_milli_node_hours=_TEST_TRAINING_BUDGET_MILLI_NODE_HOURS,
-            disable_early_stopping=_TEST_TRAINING_DISABLE_EARLY_STOPPING,
-            sync=sync,
-        )
-
-        if not sync:
-            model_from_job.wait()
-
-        true_fraction_split = gca_training_pipeline.FractionSplit(
-            training_fraction=_TEST_TRAINING_FRACTION_SPLIT,
-            validation_fraction=_TEST_VALIDATION_FRACTION_SPLIT,
-            test_fraction=_TEST_TEST_FRACTION_SPLIT,
-        )
-
-        true_managed_model = gca_model.Model(
-            display_name=_TEST_MODEL_DISPLAY_NAME,
-            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
-        )
-
-        true_input_data_config = gca_training_pipeline.InputDataConfig(
-            fraction_split=true_fraction_split,
-            predefined_split=gca_training_pipeline.PredefinedSplit(
-                key=_TEST_PREDEFINED_SPLIT_COLUMN_NAME
-            ),
-            dataset_id=mock_dataset_tabular.name,
-        )
-
-        true_training_pipeline = gca_training_pipeline.TrainingPipeline(
-            display_name=_TEST_DISPLAY_NAME,
-            training_task_definition=schema.training_job.definition.automl_tabular,
-            training_task_inputs=_TEST_TRAINING_TASK_INPUTS,
-            model_to_upload=true_managed_model,
-            input_data_config=true_input_data_config,
-            encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC,
-        )
-
-        mock_pipeline_service_create.assert_called_once_with(
-            parent=initializer.global_config.common_location_path(),
-            training_pipeline=true_training_pipeline,
-        )
+        with pytest.raises(ValueError):
+            training_jobs.AutoMLTabularTrainingJob(
+                display_name=_TEST_DISPLAY_NAME,
+                optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+                optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+                column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+                column_specs=column_specs,
+                optimization_objective_recall_value=None,
+                optimization_objective_precision_value=None,
+            )
 
     @pytest.mark.parametrize("sync", [True, False])
     def test_run_call_pipeline_service_create_with_column_specs_not_auto(

From 8239bbd5e9a00afe027d003fd8e8b979a2e48523 Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Wed, 14 Jul 2021 18:12:23 -0400
Subject: [PATCH 20/22] one more test

---
 .../test_automl_tabular_training_jobs.py      | 45 +++++++------------
 1 file changed, 16 insertions(+), 29 deletions(-)

diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index df3080522e..eeb233ec1a 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -572,12 +572,7 @@ def test_run_call_pipeline_service_create_if_set_additional_experiments(
 
     @pytest.mark.parametrize("sync", [True, False])
     def test_run_call_pipeline_service_create_with_column_specs(
-        self,
-        mock_pipeline_service_create,
-        mock_pipeline_service_get,
-        mock_dataset_tabular_alternative,
-        mock_model_service_get,
-        sync,
+        self, mock_pipeline_service_create, mock_dataset_tabular_alternative, sync,
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
 
@@ -644,21 +639,10 @@ def test_run_call_pipeline_service_create_with_column_specs(
         )
 
     @pytest.mark.parametrize("sync", [True, False])
-    # Should default to column_transformation when column_specs also passed
-    def test_run_call_pipeline_service_create_with_column_specs_and_transformations(
-        self,
-        mock_pipeline_service_create,
-        mock_pipeline_service_get,
-        mock_dataset_tabular,
-        mock_dataset_tabular_alternative,
-        mock_model_service_get,
-        sync,
+    def test_call_pipeline_service_create_with_column_specs_and_transformations_raises(
+        self, mock_dataset_tabular_alternative, sync,
     ):
-        aiplatform.init(
-            project=_TEST_PROJECT,
-            staging_bucket=_TEST_BUCKET_NAME,
-            encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
-        )
+        aiplatform.init()
 
         column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
             dataset=mock_dataset_tabular_alternative,
@@ -670,22 +654,25 @@ def test_run_call_pipeline_service_create_with_column_specs_and_transformations(
         with pytest.raises(ValueError):
             training_jobs.AutoMLTabularTrainingJob(
                 display_name=_TEST_DISPLAY_NAME,
-                optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
                 optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
                 column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
                 column_specs=column_specs,
-                optimization_objective_recall_value=None,
-                optimization_objective_precision_value=None,
+            )
+
+    @pytest.mark.parametrize("sync", [True, False])
+    def test_get_column_specs_no_target_raises(
+        self, mock_dataset_tabular_alternative, sync,
+    ):
+        aiplatform.init()
+
+        with pytest.raises(TypeError):
+            training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
+                dataset=mock_dataset_tabular_alternative
             )
 
     @pytest.mark.parametrize("sync", [True, False])
     def test_run_call_pipeline_service_create_with_column_specs_not_auto(
-        self,
-        mock_pipeline_service_create,
-        mock_pipeline_service_get,
-        mock_dataset_tabular_alternative,
-        mock_model_service_get,
-        sync,
+        self, mock_pipeline_service_create, mock_dataset_tabular_alternative, sync,
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
 

From 1fefacc8c3f07da8447e179fe2500dcbac14092b Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Wed, 14 Jul 2021 19:52:10 -0400
Subject: [PATCH 21/22] fix tests

---
 .../test_automl_tabular_training_jobs.py           | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
index eeb233ec1a..413566440f 100644
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py
@@ -572,7 +572,12 @@ def test_run_call_pipeline_service_create_if_set_additional_experiments(
 
     @pytest.mark.parametrize("sync", [True, False])
     def test_run_call_pipeline_service_create_with_column_specs(
-        self, mock_pipeline_service_create, mock_dataset_tabular_alternative, sync,
+        self,
+        mock_pipeline_service_create,
+        mock_pipeline_service_get,
+        mock_dataset_tabular_alternative,
+        mock_model_service_get,
+        sync,
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
 
@@ -672,7 +677,12 @@ def test_get_column_specs_no_target_raises(
 
     @pytest.mark.parametrize("sync", [True, False])
     def test_run_call_pipeline_service_create_with_column_specs_not_auto(
-        self, mock_pipeline_service_create, mock_dataset_tabular_alternative, sync,
+        self,
+        mock_pipeline_service_create,
+        mock_pipeline_service_get,
+        mock_dataset_tabular_alternative,
+        mock_model_service_get,
+        sync,
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
 

From 322cc6628b90a52312db0b8e46a544e07060e315 Mon Sep 17 00:00:00 2001
From: Torry Yang <torryyang@google.com>
Date: Thu, 15 Jul 2021 14:16:50 -0400
Subject: [PATCH 22/22] address feedback

---
 google/cloud/aiplatform/training_jobs.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index c9642f1dc8..1d738aa7a6 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -18,6 +18,7 @@
 import datetime
 import time
 from typing import Dict, List, Optional, Sequence, Tuple, Union
+import warnings
 
 import abc
 
@@ -2607,6 +2608,7 @@ def __init__(
                 ignored by the training, except for the targetColumn, which should have
                 no transformations defined on.
                 Only one of column_transformations or column_specs should be passed.
+                Consider using column_specs as column_transformations will be deprecated eventually.
             optimization_objective_recall_value (float):
                 Optional. Required when maximize-precision-at-recall optimizationObjective was
                 picked, represents the recall value at which the optimization is done.
@@ -2668,13 +2670,19 @@ def __init__(
                 "Both column_transformations and column_specs were passed. Only one is allowed."
             )
         if column_transformations is not None:
-            self._column_specs = None
             self._column_transformations = column_transformations
+            warnings.simplefilter("always", DeprecationWarning)
+            warnings.warn(
+                "consider using column_specs instead. column_transformations will be deprecated in the future.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
         elif column_specs is not None:
-            self._column_specs = column_specs
-            self._column_transformations = None
+            self._column_transformations = [
+                {transformation: {"column_name": column_name}}
+                for column_name, transformation in column_specs.items()
+            ]
         else:
-            self._column_specs = None
             self._column_transformations = None
         self._optimization_objective = optimization_objective
         self._optimization_prediction_type = optimization_prediction_type
@@ -2898,12 +2906,6 @@ def _run(
 
         training_task_definition = schema.training_job.definition.automl_tabular
 
-        # convert column specs to column transformations
-        if self._column_specs is not None:
-            self._column_transformations = [
-                {item[1]: {"column_name": item[0]}}
-                for item in self._column_specs.items()
-            ]
         # auto-populate transformations
         if self._column_transformations is None:
             _LOGGER.info(