From f935b764a8ce123fa60a1793080f6594ee952427 Mon Sep 17 00:00:00 2001 From: sirtorry Date: Tue, 8 Jun 2021 01:50:49 -0400 Subject: [PATCH 01/22] add transformation_specs --- google/cloud/aiplatform/column.py | 28 ++++++++++++ .../aiplatform/datasets/tabular_dataset.py | 22 +++++++++- .../datasets/time_series_dataset.py | 3 +- google/cloud/aiplatform/training_jobs.py | 43 +++++++++++++++++-- 4 files changed, 91 insertions(+), 5 deletions(-) create mode 100644 google/cloud/aiplatform/column.py diff --git a/google/cloud/aiplatform/column.py b/google/cloud/aiplatform/column.py new file mode 100644 index 0000000000..aa13e4d609 --- /dev/null +++ b/google/cloud/aiplatform/column.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""Vertex AI tabular data types""" + +class data_types: + AUTO = "auto" + NUMERIC = "numeric" + CATEGORICAL = "categorical" + TIMESTAMP = "timestamp" + TEXT = "text" + REPEATED_NUMERIC = "repeated_numeric" + REPEATED_CATEGORICAL = "repeated_categorical" + REPEATED_TEXT = "repeated_text" diff --git a/google/cloud/aiplatform/datasets/tabular_dataset.py b/google/cloud/aiplatform/datasets/tabular_dataset.py index 95f1b16f98..240a3bcb23 100644 --- a/google/cloud/aiplatform/datasets/tabular_dataset.py +++ b/google/cloud/aiplatform/datasets/tabular_dataset.py @@ -18,7 +18,7 @@ import csv import logging -from typing import List, Optional, Sequence, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union from google.auth import credentials as auth_credentials @@ -39,6 +39,26 @@ class TabularDataset(datasets._Dataset): schema.dataset.metadata.tabular, ) + def auto_column_specs(self, target_column: str) -> Dict[str, str]: + """Returns a dict with all non-target columns as keys and 'auto' as values. + Args: + target_column(str): + Required. Intended target column. + Returns: + Dict[str, str] + Column names as keys and 'auto' as values + + Raises: + RuntimeError: When no valid source is found. 
+ """ + column_names = [ + column + for column in self.column_names + if column != target_column + ] + column_specs = {column: 'auto' for column in column_names} + return column_specs + @property def column_names(self) -> List[str]: """Retrieve the columns for the dataset by extracting it from the Google Cloud Storage or diff --git a/google/cloud/aiplatform/datasets/time_series_dataset.py b/google/cloud/aiplatform/datasets/time_series_dataset.py index d5aa3dcbf2..dce0f5ec91 100644 --- a/google/cloud/aiplatform/datasets/time_series_dataset.py +++ b/google/cloud/aiplatform/datasets/time_series_dataset.py @@ -26,6 +26,7 @@ from google.cloud.aiplatform import utils +#TODO: extend tabular dataset class TimeSeriesDataset(datasets._Dataset): """Managed time series dataset resource for Vertex AI""" @@ -46,7 +47,7 @@ def create( encryption_spec_key_name: Optional[str] = None, sync: bool = True, ) -> "TimeSeriesDataset": - """Creates a new tabular dataset. + """Creates a new d dataset. Args: display_name (str): diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 51fdb55d13..87bfc98cd6 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -2301,6 +2301,7 @@ def __init__( optimization_prediction_type: str, optimization_objective: Optional[str] = None, column_transformations: Optional[Union[Dict, List[Dict]]] = None, + column_specs: Optional[Dict[str, str]] = None, optimization_objective_recall_value: Optional[float] = None, optimization_objective_precision_value: Optional[float] = None, project: Optional[str] = None, @@ -2360,6 +2361,17 @@ def __init__( If an input column has no transformations on it, such a column is ignored by the training, except for the targetColumn, which should have no transformations defined on. + Only one of column_transformations or column_specs should be passed. + column_specs (Optional[Dict[str, str]]): + Optional. 
Transformations to apply to the input columns (i.e. columns other + than the targetColumn). Each transformation may produce multiple + result values from the column's value, and all are used for training. + When creating transformation for BigQuery Struct column, the column + should be flattened using "." as the delimiter. + If an input column has no transformations on it, such a column is + ignored by the training, except for the targetColumn, which should have + no transformations defined on. + Only one of column_transformations or column_specs should be passed. optimization_objective_recall_value (float): Optional. Required when maximize-precision-at-recall optimizationObjective was picked, represents the recall value at which the optimization is done. @@ -2413,6 +2425,7 @@ def __init__( model_encryption_spec_key_name=model_encryption_spec_key_name, ) self._column_transformations = column_transformations + self._column_specs = column_specs self._optimization_objective = optimization_objective self._optimization_prediction_type = optimization_prediction_type self._optimization_objective_recall_value = optimization_objective_recall_value @@ -2627,11 +2640,36 @@ def _run( Returns: model: The trained Vertex AI Model resource or None if training did not produce an Vertex AI Model. + Raises: + ValueError: When column doesn't exist in dataset. + ValueError: When target column is in transformations. """ training_task_definition = schema.training_job.definition.automl_tabular + column_transformations = None - if self._column_transformations is None: + # user populated transformations + if self._column_transformations is not None and self._column_specs is not None: + _LOGGER.info( + "column_transformations and column_specs were both passed. column_transformations was used." 
+ ) + if self._column_transformations is not None: + column_transformations = self._column_transformations + if self._column_specs is not None and column_transformations is None: + column_transformations = [ + {self._column_specs[column]: {"column_name": column}} for column in self._column_specs + ] + if column_transformations is not None: + column_names = dataset.column_names + for transformation in column_transformations: + for data_type in transformation: + column = transformation[data_type][column_name] + if column not in column_names: + raise ValueError(f"'{column}' is not in the dataset.") + if column is target_column: + raise ValueError("Target column is in transformations.") + # auto-populate transformations + if column_transformations is None: _LOGGER.info( "No column transformations provided, so now retrieving columns from dataset in order to set default column transformations." ) @@ -2649,8 +2687,6 @@ def _run( "The column transformation of type 'auto' was set for the following columns: %s." 
% column_names ) - else: - column_transformations = self._column_transformations training_task_inputs_dict = { # required inputs @@ -2707,6 +2743,7 @@ def _add_additional_experiments(self, additional_experiments: List[str]): self._additional_experiments.extend(additional_experiments) +#TODO: add tabular sugar to forecasting class AutoMLForecastingTrainingJob(_TrainingJob): _supported_training_schemas = (schema.training_job.definition.automl_forecasting,) From f6edc10baf6ea7d293031663d444705e06facc4e Mon Sep 17 00:00:00 2001 From: sirtorry Date: Tue, 8 Jun 2021 16:01:25 -0400 Subject: [PATCH 02/22] add tests --- .../aiplatform/datasets/tabular_dataset.py | 3 ++ .../test_automl_tabular_training_jobs.py | 6 +++ tests/unit/aiplatform/test_datasets.py | 38 +++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/google/cloud/aiplatform/datasets/tabular_dataset.py b/google/cloud/aiplatform/datasets/tabular_dataset.py index 240a3bcb23..d630073642 100644 --- a/google/cloud/aiplatform/datasets/tabular_dataset.py +++ b/google/cloud/aiplatform/datasets/tabular_dataset.py @@ -50,7 +50,10 @@ def auto_column_specs(self, target_column: str) -> Dict[str, str]: Raises: RuntimeError: When no valid source is found. 
+ ValueError: When target_column is not in dataset """ + if target_column is not in self.column_names: + raise ValueError("Target column not in dataset.") column_names = [ column for column in self.column_names diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 761b03b5a0..74fdc94bdf 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -50,6 +50,12 @@ {"auto": {"column_name": "petal_length"}}, {"auto": {"column_name": "petal_width"}}, ] +__TEST_TRAINING_COLUMN_SPECS = { + "sepal_width": "auto", + "sepal_length": "auto", + "sepal_width": "auto", + "sepal_width": "auto", +} _TEST_TRAINING_TARGET_COLUMN = "target" _TEST_TRAINING_BUDGET_MILLI_NODE_HOURS = 1000 _TEST_TRAINING_WEIGHT_COLUMN = "weight" diff --git a/tests/unit/aiplatform/test_datasets.py b/tests/unit/aiplatform/test_datasets.py index 4c2a75c393..7bb2215491 100644 --- a/tests/unit/aiplatform/test_datasets.py +++ b/tests/unit/aiplatform/test_datasets.py @@ -1005,6 +1005,44 @@ def test_tabular_dataset_column_name_bigquery(self): assert my_dataset.column_names == ["column_1", "column_2"] + @pytest.mark.usefixtures( + "get_dataset_tabular_gcs_mock", "gcs_client_download_as_bytes_mock" + ) + def test_tabular_dataset_column_name_gcs(self): + my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME) + + with pytest.raises(ValueError): + my_dataset.auto_column_specs("column_3") + + @pytest.mark.usefixtures( + "get_dataset_tabular_gcs_mock", "gcs_client_download_as_bytes_mock" + ) + def test_tabular_dataset_column_name_gcs(self): + my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME) + + assert my_dataset.auto_column_specs("column_2") == {"column_1": "auto"} + + @pytest.mark.usefixtures( + "get_dataset_tabular_bq_mock", + "bigquery_client_mock", + "bigquery_table_schema_mock", + ) + def 
test_tabular_dataset_column_name_bigquery(self): + my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME) + + with pytest.raises(ValueError): + my_dataset.auto_column_specs("column_3") + + @pytest.mark.usefixtures( + "get_dataset_tabular_bq_mock", + "bigquery_client_mock", + "bigquery_table_schema_mock", + ) + def test_tabular_dataset_column_name_bigquery(self): + my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME) + + assert my_dataset.auto_column_specs("column_2") == {"column_1": "auto"} + class TestTextDataset: def setup_method(self): From 619c04aee639858dd1586156a061748971de393d Mon Sep 17 00:00:00 2001 From: sirtorry Date: Tue, 15 Jun 2021 01:31:25 -0400 Subject: [PATCH 03/22] address feeback --- google/cloud/aiplatform/column.py | 1 + .../aiplatform/datasets/tabular_dataset.py | 25 +----------- .../datasets/time_series_dataset.py | 2 +- google/cloud/aiplatform/training_jobs.py | 33 ++++++++++++++-- .../test_automl_tabular_training_jobs.py | 19 +++++++++- tests/unit/aiplatform/test_datasets.py | 38 ------------------- 6 files changed, 50 insertions(+), 68 deletions(-) diff --git a/google/cloud/aiplatform/column.py b/google/cloud/aiplatform/column.py index aa13e4d609..159b3cf415 100644 --- a/google/cloud/aiplatform/column.py +++ b/google/cloud/aiplatform/column.py @@ -17,6 +17,7 @@ """Vertex AI tabular data types""" + class data_types: AUTO = "auto" NUMERIC = "numeric" diff --git a/google/cloud/aiplatform/datasets/tabular_dataset.py b/google/cloud/aiplatform/datasets/tabular_dataset.py index d630073642..95f1b16f98 100644 --- a/google/cloud/aiplatform/datasets/tabular_dataset.py +++ b/google/cloud/aiplatform/datasets/tabular_dataset.py @@ -18,7 +18,7 @@ import csv import logging -from typing import Dict, List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union from google.auth import credentials as auth_credentials @@ -39,29 +39,6 @@ class TabularDataset(datasets._Dataset): 
schema.dataset.metadata.tabular, ) - def auto_column_specs(self, target_column: str) -> Dict[str, str]: - """Returns a dict with all non-target columns as keys and 'auto' as values. - Args: - target_column(str): - Required. Intended target column. - Returns: - Dict[str, str] - Column names as keys and 'auto' as values - - Raises: - RuntimeError: When no valid source is found. - ValueError: When target_column is not in dataset - """ - if target_column is not in self.column_names: - raise ValueError("Target column not in dataset.") - column_names = [ - column - for column in self.column_names - if column != target_column - ] - column_specs = {column: 'auto' for column in column_names} - return column_specs - @property def column_names(self) -> List[str]: """Retrieve the columns for the dataset by extracting it from the Google Cloud Storage or diff --git a/google/cloud/aiplatform/datasets/time_series_dataset.py b/google/cloud/aiplatform/datasets/time_series_dataset.py index dce0f5ec91..f9f9b865d8 100644 --- a/google/cloud/aiplatform/datasets/time_series_dataset.py +++ b/google/cloud/aiplatform/datasets/time_series_dataset.py @@ -26,7 +26,7 @@ from google.cloud.aiplatform import utils -#TODO: extend tabular dataset +# TODO: extend tabular dataset class TimeSeriesDataset(datasets._Dataset): """Managed time series dataset resource for Vertex AI""" diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 87bfc98cd6..3179686137 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -2657,13 +2657,14 @@ def _run( column_transformations = self._column_transformations if self._column_specs is not None and column_transformations is None: column_transformations = [ - {self._column_specs[column]: {"column_name": column}} for column in self._column_specs + {self._column_specs[column]: {"column_name": column}} + for column in self._column_specs ] if column_transformations is not None: 
column_names = dataset.column_names for transformation in column_transformations: for data_type in transformation: - column = transformation[data_type][column_name] + column = transformation[data_type] if column not in column_names: raise ValueError(f"'{column}' is not in the dataset.") if column is target_column: @@ -2742,8 +2743,34 @@ def _add_additional_experiments(self, additional_experiments: List[str]): """ self._additional_experiments.extend(additional_experiments) + @classmethod + def get_auto_column_specs( + self, dataset: datasets.TabularDataset, target_column: str, + ) -> Dict[str, str]: + """Returns a dict with all non-target columns as keys and 'auto' as values. + Args: + dataset (datasets.TabularDataset): + Required. Intended dataset. + target_column(str): + Required. Intended target column. + Returns: + Dict[str, str] + Column names as keys and 'auto' as values + + Raises: + RuntimeError: When no valid source is found. + ValueError: When target_column is not in dataset + """ + if target_column not in dataset.column_names: + raise ValueError("Target column not in dataset.") + column_names = [ + column for column in dataset.column_names if column != target_column + ] + column_specs = {column: "auto" for column in column_names} + return column_specs + -#TODO: add tabular sugar to forecasting +# TODO: add tabular sugar to forecasting class AutoMLForecastingTrainingJob(_TrainingJob): _supported_training_schemas = (schema.training_job.definition.automl_forecasting,) diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 74fdc94bdf..1e6671c6e5 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -233,11 +233,20 @@ def test_run_call_pipeline_service_create( encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, ) + column_specs = 
training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( + dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN, + ) + + assert column_specs == __TEST_TRAINING_COLUMN_SPECS + column_specs[ + _TEST_TRAINING_COLUMN_NAMES[0] + ] = aiplatform.column.data_types.NUMERIC + job = training_jobs.AutoMLTabularTrainingJob( display_name=_TEST_DISPLAY_NAME, optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME, optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE, - column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS, + column_specs=column_specs, optimization_objective_recall_value=None, optimization_objective_precision_value=None, ) @@ -315,11 +324,17 @@ def test_run_call_pipeline_if_no_model_display_name( ): aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME) + column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( + dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN, + ) + + assert column_specs == __TEST_TRAINING_COLUMN_SPECS + job = training_jobs.AutoMLTabularTrainingJob( display_name=_TEST_DISPLAY_NAME, optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME, optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE, - column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS, + column_specs=column_specs, optimization_objective_recall_value=None, optimization_objective_precision_value=None, training_encryption_spec_key_name=_TEST_PIPELINE_ENCRYPTION_KEY_NAME, diff --git a/tests/unit/aiplatform/test_datasets.py b/tests/unit/aiplatform/test_datasets.py index 7bb2215491..4c2a75c393 100644 --- a/tests/unit/aiplatform/test_datasets.py +++ b/tests/unit/aiplatform/test_datasets.py @@ -1005,44 +1005,6 @@ def test_tabular_dataset_column_name_bigquery(self): assert my_dataset.column_names == ["column_1", "column_2"] - @pytest.mark.usefixtures( - "get_dataset_tabular_gcs_mock", "gcs_client_download_as_bytes_mock" - ) - def 
test_tabular_dataset_column_name_gcs(self): - my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME) - - with pytest.raises(ValueError): - my_dataset.auto_column_specs("column_3") - - @pytest.mark.usefixtures( - "get_dataset_tabular_gcs_mock", "gcs_client_download_as_bytes_mock" - ) - def test_tabular_dataset_column_name_gcs(self): - my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME) - - assert my_dataset.auto_column_specs("column_2") == {"column_1": "auto"} - - @pytest.mark.usefixtures( - "get_dataset_tabular_bq_mock", - "bigquery_client_mock", - "bigquery_table_schema_mock", - ) - def test_tabular_dataset_column_name_bigquery(self): - my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME) - - with pytest.raises(ValueError): - my_dataset.auto_column_specs("column_3") - - @pytest.mark.usefixtures( - "get_dataset_tabular_bq_mock", - "bigquery_client_mock", - "bigquery_table_schema_mock", - ) - def test_tabular_dataset_column_name_bigquery(self): - my_dataset = datasets.TabularDataset(dataset_name=_TEST_NAME) - - assert my_dataset.auto_column_specs("column_2") == {"column_1": "auto"} - class TestTextDataset: def setup_method(self): From 2d52e0dd77845e660087eb72eaf397faa0e77e41 Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Tue, 15 Jun 2021 08:32:52 -0400 Subject: [PATCH 04/22] fix test fails --- google/cloud/aiplatform/training_jobs.py | 4 ++-- tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 3179686137..4344595db6 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -2665,9 +2665,9 @@ def _run( for transformation in column_transformations: for data_type in transformation: column = transformation[data_type] - if column not in column_names: + if column['column_name'] not in column_names: raise ValueError(f"'{column}' is not in 
the dataset.") - if column is target_column: + if column['column_name'] is target_column: raise ValueError("Target column is in transformations.") # auto-populate transformations if column_transformations is None: diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 1e6671c6e5..36f4749228 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -42,6 +42,7 @@ "sepal_length", "petal_length", "petal_width", + "target", ] _TEST_TRAINING_COLUMN_TRANSFORMATIONS = [ From 42ae3e7f4563862f7bfdcf78f07ae440993ee855 Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Tue, 15 Jun 2021 15:16:36 -0400 Subject: [PATCH 05/22] fix typo --- tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 36f4749228..438301bc90 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -51,7 +51,7 @@ {"auto": {"column_name": "petal_length"}}, {"auto": {"column_name": "petal_width"}}, ] -__TEST_TRAINING_COLUMN_SPECS = { +_TEST_TRAINING_COLUMN_SPECS = { "sepal_width": "auto", "sepal_length": "auto", "sepal_width": "auto", @@ -238,7 +238,7 @@ def test_run_call_pipeline_service_create( dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN, ) - assert column_specs == __TEST_TRAINING_COLUMN_SPECS + assert column_specs == _TEST_TRAINING_COLUMN_SPECS column_specs[ _TEST_TRAINING_COLUMN_NAMES[0] ] = aiplatform.column.data_types.NUMERIC @@ -329,7 +329,7 @@ def test_run_call_pipeline_if_no_model_display_name( dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN, ) - assert column_specs == __TEST_TRAINING_COLUMN_SPECS + assert 
column_specs == _TEST_TRAINING_COLUMN_SPECS job = training_jobs.AutoMLTabularTrainingJob( display_name=_TEST_DISPLAY_NAME, From 7789d4e8cf98f6cd0a4b158df9b2ccf70c3aed8b Mon Sep 17 00:00:00 2001 From: sirtorry Date: Tue, 15 Jun 2021 16:18:36 -0400 Subject: [PATCH 06/22] test bug --- tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 438301bc90..2cb137b8b0 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -54,8 +54,8 @@ _TEST_TRAINING_COLUMN_SPECS = { "sepal_width": "auto", "sepal_length": "auto", - "sepal_width": "auto", - "sepal_width": "auto", + "petal_width": "auto", + "petal_length": "auto", } _TEST_TRAINING_TARGET_COLUMN = "target" _TEST_TRAINING_BUDGET_MILLI_NODE_HOURS = 1000 From 3e6c94063ff46430504e5ca1f48c0211960e695d Mon Sep 17 00:00:00 2001 From: sirtorry Date: Tue, 15 Jun 2021 16:38:47 -0400 Subject: [PATCH 07/22] add column to init --- google/cloud/aiplatform/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index 6aa8f64161..39b88a08d9 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -45,6 +45,7 @@ AutoMLTextTrainingJob, AutoMLVideoTrainingJob, ) +from google.cloud.aiplatform.column import data_types """ Usage: From ab911095662508a5593a9f912f3e25858015084f Mon Sep 17 00:00:00 2001 From: sirtorry Date: Tue, 15 Jun 2021 17:11:19 -0400 Subject: [PATCH 08/22] modify test --- tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 2cb137b8b0..37c6e4ca0d 100644 
--- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -242,6 +242,10 @@ def test_run_call_pipeline_service_create( column_specs[ _TEST_TRAINING_COLUMN_NAMES[0] ] = aiplatform.column.data_types.NUMERIC + column_specs[ + _TEST_TRAINING_COLUMN_NAMES[0] + ] = aiplatform.column.data_types.AUTO + job = training_jobs.AutoMLTabularTrainingJob( display_name=_TEST_DISPLAY_NAME, From 9a09ae1b7c8e5e74c333e30b50a95f354857d91d Mon Sep 17 00:00:00 2001 From: sirtorry Date: Tue, 15 Jun 2021 19:09:15 -0400 Subject: [PATCH 09/22] lint --- google/cloud/aiplatform/__init__.py | 1 - google/cloud/aiplatform/training_jobs.py | 4 ++-- tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 5 +---- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index 39b88a08d9..6aa8f64161 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -45,7 +45,6 @@ AutoMLTextTrainingJob, AutoMLVideoTrainingJob, ) -from google.cloud.aiplatform.column import data_types """ Usage: diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 4344595db6..3a9105aaa7 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -2665,9 +2665,9 @@ def _run( for transformation in column_transformations: for data_type in transformation: column = transformation[data_type] - if column['column_name'] not in column_names: + if column["column_name"] not in column_names: raise ValueError(f"'{column}' is not in the dataset.") - if column['column_name'] is target_column: + if column["column_name"] is target_column: raise ValueError("Target column is in transformations.") # auto-populate transformations if column_transformations is None: diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py 
b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 37c6e4ca0d..d7016282c3 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -242,10 +242,7 @@ def test_run_call_pipeline_service_create( column_specs[ _TEST_TRAINING_COLUMN_NAMES[0] ] = aiplatform.column.data_types.NUMERIC - column_specs[ - _TEST_TRAINING_COLUMN_NAMES[0] - ] = aiplatform.column.data_types.AUTO - + column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] = aiplatform.column.data_types.AUTO job = training_jobs.AutoMLTabularTrainingJob( display_name=_TEST_DISPLAY_NAME, From 9a1ce72145a5a0eb4b1fc72b7f7d394254ae2b6d Mon Sep 17 00:00:00 2001 From: sirtorry Date: Tue, 15 Jun 2021 19:52:10 -0400 Subject: [PATCH 10/22] undo column removal from init --- google/cloud/aiplatform/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index 6aa8f64161..fcad51b35b 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -45,6 +45,7 @@ AutoMLTextTrainingJob, AutoMLVideoTrainingJob, ) +from google.cloud.aiplatform.column import data_types """ Usage: @@ -89,4 +90,5 @@ "TextDataset", "TimeSeriesDataset", "VideoDataset", + "data_types", ) From d738b55e23c5935a221ad8784662b9186746308b Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Sat, 19 Jun 2021 18:24:38 -0400 Subject: [PATCH 11/22] address comments --- .../datasets/time_series_dataset.py | 2 +- google/cloud/aiplatform/training_jobs.py | 20 +++++++++---------- .../test_automl_tabular_training_jobs.py | 1 + 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/google/cloud/aiplatform/datasets/time_series_dataset.py b/google/cloud/aiplatform/datasets/time_series_dataset.py index f9f9b865d8..8070daf10f 100644 --- a/google/cloud/aiplatform/datasets/time_series_dataset.py +++ b/google/cloud/aiplatform/datasets/time_series_dataset.py @@ -47,7 +47,7 @@ def 
create( encryption_spec_key_name: Optional[str] = None, sync: bool = True, ) -> "TimeSeriesDataset": - """Creates a new d dataset. + """Creates a new time series dataset. Args: display_name (str): diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 3a9105aaa7..124a70eb66 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -2300,8 +2300,8 @@ def __init__( display_name: str, optimization_prediction_type: str, optimization_objective: Optional[str] = None, - column_transformations: Optional[Union[Dict, List[Dict]]] = None, column_specs: Optional[Dict[str, str]] = None, + column_transformations: Optional[Union[Dict, List[Dict]]] = None, optimization_objective_recall_value: Optional[float] = None, optimization_objective_precision_value: Optional[float] = None, project: Optional[str] = None, @@ -2352,7 +2352,7 @@ def __init__( "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE). "minimize-mae" - Minimize mean-absolute error (MAE). "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE). - column_transformations (Optional[Union[Dict, List[Dict]]]): + column_specs (Optional[Dict[str, str]]): Optional. Transformations to apply to the input columns (i.e. columns other than the targetColumn). Each transformation may produce multiple result values from the column's value, and all are used for training. @@ -2361,8 +2361,7 @@ def __init__( If an input column has no transformations on it, such a column is ignored by the training, except for the targetColumn, which should have no transformations defined on. - Only one of column_transformations or column_specs should be passed. - column_specs (Optional[Dict[str, str]]): + column_transformations (Optional[Union[Dict, List[Dict]]]): Optional. Transformations to apply to the input columns (i.e. columns other than the targetColumn). 
Each transformation may produce multiple result values from the column's value, and all are used for training. @@ -2372,6 +2371,7 @@ def __init__( ignored by the training, except for the targetColumn, which should have no transformations defined on. Only one of column_transformations or column_specs should be passed. + Only one of column_transformations or column_specs should be passed. optimization_objective_recall_value (float): Optional. Required when maximize-precision-at-recall optimizationObjective was picked, represents the recall value at which the optimization is done. @@ -2655,12 +2655,6 @@ def _run( ) if self._column_transformations is not None: column_transformations = self._column_transformations - if self._column_specs is not None and column_transformations is None: - column_transformations = [ - {self._column_specs[column]: {"column_name": column}} - for column in self._column_specs - ] - if column_transformations is not None: column_names = dataset.column_names for transformation in column_transformations: for data_type in transformation: @@ -2669,6 +2663,11 @@ def _run( raise ValueError(f"'{column}' is not in the dataset.") if column["column_name"] is target_column: raise ValueError("Target column is in transformations.") + elif self._column_specs is not None: + column_transformations = [ + {self._column_specs[column]: {"column_name": column}} + for column in self._column_specs + ] # auto-populate transformations if column_transformations is None: _LOGGER.info( @@ -2743,7 +2742,6 @@ def _add_additional_experiments(self, additional_experiments: List[str]): """ self._additional_experiments.extend(additional_experiments) - @classmethod def get_auto_column_specs( self, dataset: datasets.TabularDataset, target_column: str, ) -> Dict[str, str]: diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index d7016282c3..5ced76a3ae 100644 --- 
a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -242,6 +242,7 @@ def test_run_call_pipeline_service_create( column_specs[ _TEST_TRAINING_COLUMN_NAMES[0] ] = aiplatform.column.data_types.NUMERIC + assert column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] is "numeric" column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] = aiplatform.column.data_types.AUTO job = training_jobs.AutoMLTabularTrainingJob( From 5fef60b1b75188c73250379930d4f3cba0a9db9a Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Sat, 19 Jun 2021 20:47:04 -0400 Subject: [PATCH 12/22] remove self arg --- google/cloud/aiplatform/training_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 514b740f6b..72ad39be95 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -2971,7 +2971,7 @@ def _add_additional_experiments(self, additional_experiments: List[str]): self._additional_experiments.extend(additional_experiments) def get_auto_column_specs( - self, dataset: datasets.TabularDataset, target_column: str, + dataset: datasets.TabularDataset, target_column: str, ) -> Dict[str, str]: """Returns a dict with all non-target columns as keys and 'auto' as values. 
Args: From 23ac4c21534ad9e8f2f3a671baaf02add540fb08 Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Sun, 20 Jun 2021 14:34:26 -0400 Subject: [PATCH 13/22] lint --- google/cloud/aiplatform/training_jobs.py | 27 ++++++---- .../test_automl_tabular_training_jobs.py | 50 ++++++++++++------- 2 files changed, 48 insertions(+), 29 deletions(-) diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 72ad39be95..339c69c6ef 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -697,7 +697,9 @@ def _get_model(self) -> Optional[models.Model]: ) return models.Model( - fields.id, project=fields.project, location=fields.location, + fields.id, + project=fields.project, + location=fields.location, ) def _wait_callback(self): @@ -1161,12 +1163,14 @@ def _prepare_and_validate_run( model_display_name = model_display_name or self._display_name + "-model" # validates args and will raise - worker_pool_specs = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( - replica_count=replica_count, - machine_type=machine_type, - accelerator_count=accelerator_count, - accelerator_type=accelerator_type, - ).pool_specs + worker_pool_specs = ( + worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( + replica_count=replica_count, + machine_type=machine_type, + accelerator_count=accelerator_count, + accelerator_type=accelerator_type, + ).pool_specs + ) managed_model = self._managed_model if model_display_name: @@ -2971,7 +2975,8 @@ def _add_additional_experiments(self, additional_experiments: List[str]): self._additional_experiments.extend(additional_experiments) def get_auto_column_specs( - dataset: datasets.TabularDataset, target_column: str, + dataset: datasets.TabularDataset, + target_column: str, ) -> Dict[str, str]: """Returns a dict with all non-target columns as keys and 'auto' as values. 
Args: @@ -4791,8 +4796,10 @@ def __init__( schema.training_job.definition.automl_text_classification ) - training_task_inputs_dict = training_job_inputs.AutoMlTextClassificationInputs( - multi_label=multi_label + training_task_inputs_dict = ( + training_job_inputs.AutoMlTextClassificationInputs( + multi_label=multi_label + ) ) elif prediction_type == "extraction": training_task_definition = ( diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 5ced76a3ae..9433bdeae7 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -78,7 +78,8 @@ "optimizationObjectivePrecisionValue": None, } _TEST_TRAINING_TASK_INPUTS = json_format.ParseDict( - _TEST_TRAINING_TASK_INPUTS_DICT, struct_pb2.Value(), + _TEST_TRAINING_TASK_INPUTS_DICT, + struct_pb2.Value(), ) _TEST_TRAINING_TASK_INPUTS_WITH_ADDITIONAL_EXPERIMENTS = json_format.ParseDict( { @@ -126,10 +127,12 @@ def mock_pipeline_service_create(): with mock.patch.object( pipeline_service_client.PipelineServiceClient, "create_training_pipeline" ) as mock_create_training_pipeline: - mock_create_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline( - name=_TEST_PIPELINE_RESOURCE_NAME, - state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED, - model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME), + mock_create_training_pipeline.return_value = ( + gca_training_pipeline.TrainingPipeline( + name=_TEST_PIPELINE_RESOURCE_NAME, + state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED, + model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME), + ) ) yield mock_create_training_pipeline @@ -139,10 +142,12 @@ def mock_pipeline_service_get(): with mock.patch.object( pipeline_service_client.PipelineServiceClient, "get_training_pipeline" ) as mock_get_training_pipeline: - mock_get_training_pipeline.return_value = 
gca_training_pipeline.TrainingPipeline( - name=_TEST_PIPELINE_RESOURCE_NAME, - state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED, - model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME), + mock_get_training_pipeline.return_value = ( + gca_training_pipeline.TrainingPipeline( + name=_TEST_PIPELINE_RESOURCE_NAME, + state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED, + model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME), + ) ) yield mock_get_training_pipeline @@ -152,17 +157,21 @@ def mock_pipeline_service_create_and_get_with_fail(): with mock.patch.object( pipeline_service_client.PipelineServiceClient, "create_training_pipeline" ) as mock_create_training_pipeline: - mock_create_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline( - name=_TEST_PIPELINE_RESOURCE_NAME, - state=gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING, + mock_create_training_pipeline.return_value = ( + gca_training_pipeline.TrainingPipeline( + name=_TEST_PIPELINE_RESOURCE_NAME, + state=gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING, + ) ) with mock.patch.object( pipeline_service_client.PipelineServiceClient, "get_training_pipeline" ) as mock_get_training_pipeline: - mock_get_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline( - name=_TEST_PIPELINE_RESOURCE_NAME, - state=gca_pipeline_state.PipelineState.PIPELINE_STATE_FAILED, + mock_get_training_pipeline.return_value = ( + gca_training_pipeline.TrainingPipeline( + name=_TEST_PIPELINE_RESOURCE_NAME, + state=gca_pipeline_state.PipelineState.PIPELINE_STATE_FAILED, + ) ) yield mock_create_training_pipeline, mock_get_training_pipeline @@ -235,14 +244,15 @@ def test_run_call_pipeline_service_create( ) column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( - dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN, + dataset=mock_dataset_tabular, + target_column=_TEST_TRAINING_TARGET_COLUMN, ) assert column_specs == 
_TEST_TRAINING_COLUMN_SPECS column_specs[ _TEST_TRAINING_COLUMN_NAMES[0] ] = aiplatform.column.data_types.NUMERIC - assert column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] is "numeric" + assert column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] == "numeric" column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] = aiplatform.column.data_types.AUTO job = training_jobs.AutoMLTabularTrainingJob( @@ -328,7 +338,8 @@ def test_run_call_pipeline_if_no_model_display_name( aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME) column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( - dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN, + dataset=mock_dataset_tabular, + target_column=_TEST_TRAINING_TARGET_COLUMN, ) assert column_specs == _TEST_TRAINING_COLUMN_SPECS @@ -370,7 +381,8 @@ def test_run_call_pipeline_if_no_model_display_name( ) true_input_data_config = gca_training_pipeline.InputDataConfig( - fraction_split=true_fraction_split, dataset_id=mock_dataset_tabular.name, + fraction_split=true_fraction_split, + dataset_id=mock_dataset_tabular.name, ) true_training_pipeline = gca_training_pipeline.TrainingPipeline( From d5af1e65342e93ad1fc8012413cc68a5abbdcb33 Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Thu, 8 Jul 2021 22:42:07 -0400 Subject: [PATCH 14/22] address feedback --- google/cloud/aiplatform/__init__.py | 4 +- .../cloud/aiplatform/{column.py => automl.py} | 19 ++-- google/cloud/aiplatform/training_jobs.py | 93 ++++++++++--------- .../test_automl_tabular_training_jobs.py | 54 +++++------ 4 files changed, 82 insertions(+), 88 deletions(-) rename google/cloud/aiplatform/{column.py => automl.py} (66%) diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index 78fb4dadb6..1fd1b0e8f3 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -46,7 +46,7 @@ AutoMLTextTrainingJob, AutoMLVideoTrainingJob, ) -from google.cloud.aiplatform.column import data_types 
+from google.cloud.aiplatform.automl import column_data_types """ Usage: @@ -92,5 +92,5 @@ "TextDataset", "TimeSeriesDataset", "VideoDataset", - "data_types", + "column_data_types", ) diff --git a/google/cloud/aiplatform/column.py b/google/cloud/aiplatform/automl.py similarity index 66% rename from google/cloud/aiplatform/column.py rename to google/cloud/aiplatform/automl.py index 159b3cf415..59e4675a5a 100644 --- a/google/cloud/aiplatform/column.py +++ b/google/cloud/aiplatform/automl.py @@ -18,12 +18,13 @@ """Vertex AI tabular data types""" -class data_types: - AUTO = "auto" - NUMERIC = "numeric" - CATEGORICAL = "categorical" - TIMESTAMP = "timestamp" - TEXT = "text" - REPEATED_NUMERIC = "repeated_numeric" - REPEATED_CATEGORICAL = "repeated_categorical" - REPEATED_TEXT = "repeated_text" +class tabular: + class column_data_types: + AUTO = "auto" + NUMERIC = "numeric" + CATEGORICAL = "categorical" + TIMESTAMP = "timestamp" + TEXT = "text" + REPEATED_NUMERIC = "repeated_numeric" + REPEATED_CATEGORICAL = "repeated_categorical" + REPEATED_TEXT = "repeated_text" diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 339c69c6ef..c324f8af9e 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -697,9 +697,7 @@ def _get_model(self) -> Optional[models.Model]: ) return models.Model( - fields.id, - project=fields.project, - location=fields.location, + fields.id, project=fields.project, location=fields.location, ) def _wait_callback(self): @@ -1163,14 +1161,12 @@ def _prepare_and_validate_run( model_display_name = model_display_name or self._display_name + "-model" # validates args and will raise - worker_pool_specs = ( - worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( - replica_count=replica_count, - machine_type=machine_type, - accelerator_count=accelerator_count, - accelerator_type=accelerator_type, - ).pool_specs - ) + worker_pool_specs = 
worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( + replica_count=replica_count, + machine_type=machine_type, + accelerator_count=accelerator_count, + accelerator_type=accelerator_type, + ).pool_specs managed_model = self._managed_model if model_display_name: @@ -2540,6 +2536,15 @@ def __init__( ): """Constructs a AutoML Tabular Training Job. + Example usage: + + job = training_jobs.AutoMLTabularTrainingJob( + display_name="my_display_name", + optimization_prediction_type="classification", + optimization_objective="minimize-log-loss", + column_specs=my_column_specs, + ) + Args: display_name (str): Required. The user-defined name of this TrainingPipeline. @@ -2580,7 +2585,7 @@ def __init__( "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE). "minimize-mae" - Minimize mean-absolute error (MAE). "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE). - column_specs (Optional[Dict[str, str]]): + column_specs (Dict[str, str]): Optional. Transformations to apply to the input columns (i.e. columns other than the targetColumn). Each transformation may produce multiple result values from the column's value, and all are used for training. @@ -2589,7 +2594,8 @@ def __init__( If an input column has no transformations on it, such a column is ignored by the training, except for the targetColumn, which should have no transformations defined on. - column_transformations (Optional[Union[Dict, List[Dict]]]): + Only one of column_transformations or column_specs should be passed. + column_transformations (Union[Dict, List[Dict]]): Optional. Transformations to apply to the input columns (i.e. columns other than the targetColumn). Each transformation may produce multiple result values from the column's value, and all are used for training. @@ -2599,7 +2605,6 @@ def __init__( ignored by the training, except for the targetColumn, which should have no transformations defined on. Only one of column_transformations or column_specs should be passed. 
- Only one of column_transformations or column_specs should be passed. optimization_objective_recall_value (float): Optional. Required when maximize-precision-at-recall optimizationObjective was picked, represents the recall value at which the optimization is done. @@ -2652,8 +2657,16 @@ def __init__( training_encryption_spec_key_name=training_encryption_spec_key_name, model_encryption_spec_key_name=model_encryption_spec_key_name, ) - self._column_transformations = column_transformations - self._column_specs = column_specs + # user populated transformations + if self._column_transformations is not None and self._column_specs is not None: + _LOGGER.info( + "column_transformations and column_specs were both passed. column_transformations was used." + ) + if column_transformations is not None: + self._column_transformations = column_transformations + self._column_specs = None + elif column_specs is not None: + self._column_specs = column_specs self._optimization_objective = optimization_objective self._optimization_prediction_type = optimization_prediction_type self._optimization_objective_recall_value = optimization_objective_recall_value @@ -2880,28 +2893,13 @@ def _run( training_task_definition = schema.training_job.definition.automl_tabular column_transformations = None - # user populated transformations - if self._column_transformations is not None and self._column_specs is not None: - _LOGGER.info( - "column_transformations and column_specs were both passed. column_transformations was used." 
- ) - if self._column_transformations is not None: - column_transformations = self._column_transformations - column_names = dataset.column_names - for transformation in column_transformations: - for data_type in transformation: - column = transformation[data_type] - if column["column_name"] not in column_names: - raise ValueError(f"'{column}' is not in the dataset.") - if column["column_name"] is target_column: - raise ValueError("Target column is in transformations.") - elif self._column_specs is not None: - column_transformations = [ - {self._column_specs[column]: {"column_name": column}} - for column in self._column_specs + # convert column specs to column transformations + if self._column_specs is not None: + self._column_transformations = [ + {item[1]: {"column_name": item[0]}} for item in self._column_specs.items ] # auto-populate transformations - if column_transformations is None: + if self._column_transformations is None: _LOGGER.info( "No column transformations provided, so now retrieving columns from dataset in order to set default column transformations." 
) @@ -2911,7 +2909,7 @@ def _run( for column_name in dataset.column_names if column_name != target_column ] - column_transformations = [ + self._column_transformations = [ {"auto": {"column_name": column_name}} for column_name in column_names ] @@ -2923,7 +2921,7 @@ def _run( training_task_inputs_dict = { # required inputs "targetColumn": target_column, - "transformations": column_transformations, + "transformations": self._column_transformations, "trainBudgetMilliNodeHours": budget_milli_node_hours, # optional inputs "weightColumnName": weight_column, @@ -2974,11 +2972,19 @@ def _add_additional_experiments(self, additional_experiments: List[str]): """ self._additional_experiments.extend(additional_experiments) + @staticmethod def get_auto_column_specs( - dataset: datasets.TabularDataset, - target_column: str, + dataset: datasets.TabularDataset, target_column: str, ) -> Dict[str, str]: """Returns a dict with all non-target columns as keys and 'auto' as values. + + Example usage: + + column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( + dataset=my_dataset, + target_column="my_target_column", + ) + Args: dataset (datasets.TabularDataset): Required. Intended dataset. 
@@ -3001,7 +3007,6 @@ def get_auto_column_specs( return column_specs -# TODO: add tabular sugar to forecasting class AutoMLForecastingTrainingJob(_TrainingJob): _supported_training_schemas = (schema.training_job.definition.automl_forecasting,) @@ -4796,10 +4801,8 @@ def __init__( schema.training_job.definition.automl_text_classification ) - training_task_inputs_dict = ( - training_job_inputs.AutoMlTextClassificationInputs( - multi_label=multi_label - ) + training_task_inputs_dict = training_job_inputs.AutoMlTextClassificationInputs( + multi_label=multi_label ) elif prediction_type == "extraction": training_task_definition = ( diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 9433bdeae7..7809bbeefb 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -78,8 +78,7 @@ "optimizationObjectivePrecisionValue": None, } _TEST_TRAINING_TASK_INPUTS = json_format.ParseDict( - _TEST_TRAINING_TASK_INPUTS_DICT, - struct_pb2.Value(), + _TEST_TRAINING_TASK_INPUTS_DICT, struct_pb2.Value(), ) _TEST_TRAINING_TASK_INPUTS_WITH_ADDITIONAL_EXPERIMENTS = json_format.ParseDict( { @@ -127,12 +126,10 @@ def mock_pipeline_service_create(): with mock.patch.object( pipeline_service_client.PipelineServiceClient, "create_training_pipeline" ) as mock_create_training_pipeline: - mock_create_training_pipeline.return_value = ( - gca_training_pipeline.TrainingPipeline( - name=_TEST_PIPELINE_RESOURCE_NAME, - state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED, - model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME), - ) + mock_create_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline( + name=_TEST_PIPELINE_RESOURCE_NAME, + state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED, + model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME), ) yield mock_create_training_pipeline @@ -142,12 
+139,10 @@ def mock_pipeline_service_get(): with mock.patch.object( pipeline_service_client.PipelineServiceClient, "get_training_pipeline" ) as mock_get_training_pipeline: - mock_get_training_pipeline.return_value = ( - gca_training_pipeline.TrainingPipeline( - name=_TEST_PIPELINE_RESOURCE_NAME, - state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED, - model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME), - ) + mock_get_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline( + name=_TEST_PIPELINE_RESOURCE_NAME, + state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED, + model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME), ) yield mock_get_training_pipeline @@ -157,21 +152,17 @@ def mock_pipeline_service_create_and_get_with_fail(): with mock.patch.object( pipeline_service_client.PipelineServiceClient, "create_training_pipeline" ) as mock_create_training_pipeline: - mock_create_training_pipeline.return_value = ( - gca_training_pipeline.TrainingPipeline( - name=_TEST_PIPELINE_RESOURCE_NAME, - state=gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING, - ) + mock_create_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline( + name=_TEST_PIPELINE_RESOURCE_NAME, + state=gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING, ) with mock.patch.object( pipeline_service_client.PipelineServiceClient, "get_training_pipeline" ) as mock_get_training_pipeline: - mock_get_training_pipeline.return_value = ( - gca_training_pipeline.TrainingPipeline( - name=_TEST_PIPELINE_RESOURCE_NAME, - state=gca_pipeline_state.PipelineState.PIPELINE_STATE_FAILED, - ) + mock_get_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline( + name=_TEST_PIPELINE_RESOURCE_NAME, + state=gca_pipeline_state.PipelineState.PIPELINE_STATE_FAILED, ) yield mock_create_training_pipeline, mock_get_training_pipeline @@ -244,16 +235,17 @@ def test_run_call_pipeline_service_create( ) column_specs = 
training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( - dataset=mock_dataset_tabular, - target_column=_TEST_TRAINING_TARGET_COLUMN, + dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN, ) assert column_specs == _TEST_TRAINING_COLUMN_SPECS column_specs[ _TEST_TRAINING_COLUMN_NAMES[0] - ] = aiplatform.column.data_types.NUMERIC + ] = aiplatform.automl.tabular.column_data_types.NUMERIC assert column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] == "numeric" - column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] = aiplatform.column.data_types.AUTO + column_specs[ + _TEST_TRAINING_COLUMN_NAMES[0] + ] = aiplatform.tabular.column.data_types.AUTO job = training_jobs.AutoMLTabularTrainingJob( display_name=_TEST_DISPLAY_NAME, @@ -338,8 +330,7 @@ def test_run_call_pipeline_if_no_model_display_name( aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME) column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( - dataset=mock_dataset_tabular, - target_column=_TEST_TRAINING_TARGET_COLUMN, + dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN, ) assert column_specs == _TEST_TRAINING_COLUMN_SPECS @@ -381,8 +372,7 @@ def test_run_call_pipeline_if_no_model_display_name( ) true_input_data_config = gca_training_pipeline.InputDataConfig( - fraction_split=true_fraction_split, - dataset_id=mock_dataset_tabular.name, + fraction_split=true_fraction_split, dataset_id=mock_dataset_tabular.name, ) true_training_pipeline = gca_training_pipeline.TrainingPipeline( From b9e21dcfb878f5af24681d326fda126444665b9e Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Fri, 9 Jul 2021 20:55:09 -0400 Subject: [PATCH 15/22] add tests --- google/cloud/aiplatform/__init__.py | 2 - google/cloud/aiplatform/automl.py | 30 -- google/cloud/aiplatform/training_jobs.py | 13 +- .../test_automl_tabular_training_jobs.py | 329 ++++++++++++++++-- 4 files changed, 315 insertions(+), 59 deletions(-) delete mode 100644 google/cloud/aiplatform/automl.py diff 
--git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index 1fd1b0e8f3..ddf9c4e6e7 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -46,7 +46,6 @@ AutoMLTextTrainingJob, AutoMLVideoTrainingJob, ) -from google.cloud.aiplatform.automl import column_data_types """ Usage: @@ -92,5 +91,4 @@ "TextDataset", "TimeSeriesDataset", "VideoDataset", - "column_data_types", ) diff --git a/google/cloud/aiplatform/automl.py b/google/cloud/aiplatform/automl.py deleted file mode 100644 index 59e4675a5a..0000000000 --- a/google/cloud/aiplatform/automl.py +++ /dev/null @@ -1,30 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Vertex AI tabular data types""" - - -class tabular: - class column_data_types: - AUTO = "auto" - NUMERIC = "numeric" - CATEGORICAL = "categorical" - TIMESTAMP = "timestamp" - TEXT = "text" - REPEATED_NUMERIC = "repeated_numeric" - REPEATED_CATEGORICAL = "repeated_categorical" - REPEATED_TEXT = "repeated_text" diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index c324f8af9e..83926e4d7f 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -2891,7 +2891,6 @@ def _run( """ training_task_definition = schema.training_job.definition.automl_tabular - column_transformations = None # convert column specs to column transformations if self._column_specs is not None: @@ -2977,7 +2976,7 @@ def get_auto_column_specs( dataset: datasets.TabularDataset, target_column: str, ) -> Dict[str, str]: """Returns a dict with all non-target columns as keys and 'auto' as values. - + Example usage: column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( @@ -3006,6 +3005,16 @@ def get_auto_column_specs( column_specs = {column: "auto" for column in column_names} return column_specs + class column_data_types: + AUTO = "auto" + NUMERIC = "numeric" + CATEGORICAL = "categorical" + TIMESTAMP = "timestamp" + TEXT = "text" + REPEATED_NUMERIC = "repeated_numeric" + REPEATED_CATEGORICAL = "repeated_categorical" + REPEATED_TEXT = "repeated_text" + class AutoMLForecastingTrainingJob(_TrainingJob): _supported_training_schemas = (schema.training_job.definition.automl_forecasting,) diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 7809bbeefb..1ed9d183bc 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -45,6 +45,13 @@ "target", ] +_TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE = [ + "apple", + "banana", + 
"coconut", + "target", +] + _TEST_TRAINING_COLUMN_TRANSFORMATIONS = [ {"auto": {"column_name": "sepal_width"}}, {"auto": {"column_name": "sepal_length"}}, @@ -52,11 +59,20 @@ {"auto": {"column_name": "petal_width"}}, ] _TEST_TRAINING_COLUMN_SPECS = { - "sepal_width": "auto", - "sepal_length": "auto", - "petal_width": "auto", - "petal_length": "auto", + "apple": "auto", + "banana": "auto", + "coconut": "auto", } +_TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE = [ + {"auto": {"column_name": "apple"}}, + {"auto": {"column_name": "banana"}}, + {"auto": {"column_name": "coconut"}}, +] +_TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE_NOT_AUTO = [ + {"numeric": {"column_name": "apple"}}, + {"categorical": {"column_name": "banana"}}, + {"text": {"column_name": "coconut"}}, +] _TEST_TRAINING_TARGET_COLUMN = "target" _TEST_TRAINING_BUDGET_MILLI_NODE_HOURS = 1000 _TEST_TRAINING_WEIGHT_COLUMN = "weight" @@ -87,6 +103,20 @@ }, struct_pb2.Value(), ) +_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE = json_format.ParseDict( + { + **_TEST_TRAINING_TASK_INPUTS_DICT, + "transformations": _TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE, + }, + struct_pb2.Value(), +) +_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE_NOT_AUTO = json_format.ParseDict( + { + **_TEST_TRAINING_TASK_INPUTS_DICT, + "transformations": _TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE_NOT_AUTO, + }, + struct_pb2.Value(), +) _TEST_DATASET_NAME = "test-dataset-name" @@ -195,6 +225,24 @@ def mock_dataset_tabular(): yield ds +@pytest.fixture +def mock_dataset_tabular_alternative(): + ds = mock.MagicMock(datasets.TabularDataset) + ds.name = _TEST_DATASET_NAME + ds._latest_future = None + ds._exception = None + ds._gca_resource = gca_dataset.Dataset( + display_name=_TEST_DATASET_DISPLAY_NAME, + metadata_schema_uri=_TEST_METADATA_SCHEMA_URI_TABULAR, + labels={}, + name=_TEST_DATASET_NAME, + metadata={}, + ) + ds.column_names = _TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE + + yield ds + + @pytest.fixture def mock_dataset_nontabular(): 
ds = mock.MagicMock(datasets.ImageDataset) @@ -234,24 +282,11 @@ def test_run_call_pipeline_service_create( encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, ) - column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( - dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN, - ) - - assert column_specs == _TEST_TRAINING_COLUMN_SPECS - column_specs[ - _TEST_TRAINING_COLUMN_NAMES[0] - ] = aiplatform.automl.tabular.column_data_types.NUMERIC - assert column_specs[_TEST_TRAINING_COLUMN_NAMES[0]] == "numeric" - column_specs[ - _TEST_TRAINING_COLUMN_NAMES[0] - ] = aiplatform.tabular.column.data_types.AUTO - job = training_jobs.AutoMLTabularTrainingJob( display_name=_TEST_DISPLAY_NAME, optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME, optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE, - column_specs=column_specs, + column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS, optimization_objective_recall_value=None, optimization_objective_precision_value=None, ) @@ -329,17 +364,11 @@ def test_run_call_pipeline_if_no_model_display_name( ): aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME) - column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( - dataset=mock_dataset_tabular, target_column=_TEST_TRAINING_TARGET_COLUMN, - ) - - assert column_specs == _TEST_TRAINING_COLUMN_SPECS - job = training_jobs.AutoMLTabularTrainingJob( display_name=_TEST_DISPLAY_NAME, optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME, optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE, - column_specs=column_specs, + column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS, optimization_objective_recall_value=None, optimization_objective_precision_value=None, training_encryption_spec_key_name=_TEST_PIPELINE_ENCRYPTION_KEY_NAME, @@ -541,12 +570,262 @@ def test_run_call_pipeline_service_create_if_set_additional_experiments( 
training_pipeline=true_training_pipeline, ) + @pytest.mark.parametrize("sync", [True, False]) + def test_run_call_pipeline_service_create_with_column_specs( + self, + mock_pipeline_service_create, + mock_pipeline_service_get, + mock_dataset_tabular_alternative, + mock_model_service_get, + sync, + ): + aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME) + + column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( + dataset=mock_dataset_tabular_alternative, + target_column=_TEST_TRAINING_TARGET_COLUMN, + ) + + assert column_specs == _TEST_TRAINING_COLUMN_SPECS + + job = training_jobs.AutoMLTabularTrainingJob( + display_name=_TEST_DISPLAY_NAME, + optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME, + optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE, + column_specs=column_specs, + optimization_objective_recall_value=None, + optimization_objective_precision_value=None, + ) + + model_from_job = job.run( + dataset=mock_dataset_tabular_alternative, + target_column=_TEST_TRAINING_TARGET_COLUMN, + model_display_name=_TEST_MODEL_DISPLAY_NAME, + training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT, + validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT, + test_fraction_split=_TEST_TEST_FRACTION_SPLIT, + predefined_split_column_name=_TEST_PREDEFINED_SPLIT_COLUMN_NAME, + weight_column=_TEST_TRAINING_WEIGHT_COLUMN, + budget_milli_node_hours=_TEST_TRAINING_BUDGET_MILLI_NODE_HOURS, + disable_early_stopping=_TEST_TRAINING_DISABLE_EARLY_STOPPING, + sync=sync, + ) + + if not sync: + model_from_job.wait() + + true_fraction_split = gca_training_pipeline.FractionSplit( + training_fraction=_TEST_TRAINING_FRACTION_SPLIT, + validation_fraction=_TEST_VALIDATION_FRACTION_SPLIT, + test_fraction=_TEST_TEST_FRACTION_SPLIT, + ) + + true_managed_model = gca_model.Model( + display_name=_TEST_MODEL_DISPLAY_NAME, + encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, + ) + + true_input_data_config = 
gca_training_pipeline.InputDataConfig( + fraction_split=true_fraction_split, + predefined_split=gca_training_pipeline.PredefinedSplit( + key=_TEST_PREDEFINED_SPLIT_COLUMN_NAME + ), + dataset_id=mock_dataset_tabular_alternative.name, + ) + + true_training_pipeline = gca_training_pipeline.TrainingPipeline( + display_name=_TEST_DISPLAY_NAME, + training_task_definition=schema.training_job.definition.automl_tabular, + training_task_inputs=_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE, + model_to_upload=true_managed_model, + input_data_config=true_input_data_config, + encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, + ) + + mock_pipeline_service_create.assert_called_once_with( + parent=initializer.global_config.common_location_path(), + training_pipeline=true_training_pipeline, + ) + + @pytest.mark.parametrize("sync", [True, False]) + # Should default to column_transformation when column_specs also passed + def test_run_call_pipeline_service_create_with_column_specs_and_transformations( + self, + mock_pipeline_service_create, + mock_pipeline_service_get, + mock_dataset_tabular, + mock_dataset_alternative, + mock_model_service_get, + sync, + ): + aiplatform.init( + project=_TEST_PROJECT, + staging_bucket=_TEST_BUCKET_NAME, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( + dataset=mock_dataset_tabular_alternative, + target_column=_TEST_TRAINING_TARGET_COLUMN, + ) + + assert column_specs == _TEST_TRAINING_COLUMN_SPECS + + job = training_jobs.AutoMLTabularTrainingJob( + display_name=_TEST_DISPLAY_NAME, + optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME, + optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE, + column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS, + column_specs=column_specs, + optimization_objective_recall_value=None, + optimization_objective_precision_value=None, + ) + + model_from_job = job.run( + 
dataset=mock_dataset_tabular, + target_column=_TEST_TRAINING_TARGET_COLUMN, + model_display_name=_TEST_MODEL_DISPLAY_NAME, + training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT, + validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT, + test_fraction_split=_TEST_TEST_FRACTION_SPLIT, + predefined_split_column_name=_TEST_PREDEFINED_SPLIT_COLUMN_NAME, + weight_column=_TEST_TRAINING_WEIGHT_COLUMN, + budget_milli_node_hours=_TEST_TRAINING_BUDGET_MILLI_NODE_HOURS, + disable_early_stopping=_TEST_TRAINING_DISABLE_EARLY_STOPPING, + sync=sync, + ) + + if not sync: + model_from_job.wait() + + true_fraction_split = gca_training_pipeline.FractionSplit( + training_fraction=_TEST_TRAINING_FRACTION_SPLIT, + validation_fraction=_TEST_VALIDATION_FRACTION_SPLIT, + test_fraction=_TEST_TEST_FRACTION_SPLIT, + ) + + true_managed_model = gca_model.Model( + display_name=_TEST_MODEL_DISPLAY_NAME, + encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, + ) + + true_input_data_config = gca_training_pipeline.InputDataConfig( + fraction_split=true_fraction_split, + predefined_split=gca_training_pipeline.PredefinedSplit( + key=_TEST_PREDEFINED_SPLIT_COLUMN_NAME + ), + dataset_id=mock_dataset_tabular.name, + ) + + true_training_pipeline = gca_training_pipeline.TrainingPipeline( + display_name=_TEST_DISPLAY_NAME, + training_task_definition=schema.training_job.definition.automl_tabular, + training_task_inputs=_TEST_TRAINING_TASK_INPUTS, + model_to_upload=true_managed_model, + input_data_config=true_input_data_config, + encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, + ) + + mock_pipeline_service_create.assert_called_once_with( + parent=initializer.global_config.common_location_path(), + training_pipeline=true_training_pipeline, + ) + + @pytest.mark.parametrize("sync", [True, False]) + def test_run_call_pipeline_service_create_with_column_specs_not_auto( + self, + mock_pipeline_service_create, + mock_pipeline_service_get, + mock_dataset_tabular_alternative, + mock_model_service_get, + sync, + ): + 
aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME) + + column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( + dataset=mock_dataset_tabular_alternative, + target_column=_TEST_TRAINING_TARGET_COLUMN, + ) + column_specs[ + _TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE[0] + ] = training_jobs.AutoMLTabularTrainingJob.column_data_types.NUMERIC + column_specs[ + _TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE[1] + ] = training_jobs.AutoMLTabularTrainingJob.column_data_types.CATEGORICAL + column_specs[ + _TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE[2] + ] = training_jobs.AutoMLTabularTrainingJob.column_data_types.TEXT + + assert ( + column_specs == _TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE_NOT_AUTO + ) + + job = training_jobs.AutoMLTabularTrainingJob( + display_name=_TEST_DISPLAY_NAME, + optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME, + optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE, + column_specs=column_specs, + optimization_objective_recall_value=None, + optimization_objective_precision_value=None, + ) + + model_from_job = job.run( + dataset=mock_dataset_tabular_alternative, + target_column=_TEST_TRAINING_TARGET_COLUMN, + model_display_name=_TEST_MODEL_DISPLAY_NAME, + training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT, + validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT, + test_fraction_split=_TEST_TEST_FRACTION_SPLIT, + predefined_split_column_name=_TEST_PREDEFINED_SPLIT_COLUMN_NAME, + weight_column=_TEST_TRAINING_WEIGHT_COLUMN, + budget_milli_node_hours=_TEST_TRAINING_BUDGET_MILLI_NODE_HOURS, + disable_early_stopping=_TEST_TRAINING_DISABLE_EARLY_STOPPING, + sync=sync, + ) + + if not sync: + model_from_job.wait() + + true_fraction_split = gca_training_pipeline.FractionSplit( + training_fraction=_TEST_TRAINING_FRACTION_SPLIT, + validation_fraction=_TEST_VALIDATION_FRACTION_SPLIT, + test_fraction=_TEST_TEST_FRACTION_SPLIT, + ) + + true_managed_model = gca_model.Model( + 
display_name=_TEST_MODEL_DISPLAY_NAME, + encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, + ) + + true_input_data_config = gca_training_pipeline.InputDataConfig( + fraction_split=true_fraction_split, + predefined_split=gca_training_pipeline.PredefinedSplit( + key=_TEST_PREDEFINED_SPLIT_COLUMN_NAME + ), + dataset_id=mock_dataset_tabular_alternative.name, + ) + + true_training_pipeline = gca_training_pipeline.TrainingPipeline( + display_name=_TEST_DISPLAY_NAME, + training_task_definition=schema.training_job.definition.automl_tabular, + training_task_inputs=_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE_NOT_AUTO, + model_to_upload=true_managed_model, + input_data_config=true_input_data_config, + encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, + ) + + mock_pipeline_service_create.assert_called_once_with( + parent=initializer.global_config.common_location_path(), + training_pipeline=true_training_pipeline, + ) + @pytest.mark.usefixtures( "mock_pipeline_service_create", "mock_pipeline_service_get", "mock_model_service_get", ) @pytest.mark.parametrize("sync", [True, False]) + # Also acts as a custom column_transformations test as it should not error during first call def test_run_called_twice_raises(self, mock_dataset_tabular, sync): aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME) From 24d12aa958c3c64ebbf83608ea579dd3c8e662ba Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Fri, 9 Jul 2021 21:28:06 -0400 Subject: [PATCH 16/22] bugs --- google/cloud/aiplatform/training_jobs.py | 2 +- tests/unit/aiplatform/test_automl_tabular_training_jobs.py | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 83926e4d7f..851b2a2b47 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -2658,7 +2658,7 @@ def __init__( model_encryption_spec_key_name=model_encryption_spec_key_name, ) # user populated transformations - if 
self._column_transformations is not None and self._column_specs is not None: + if column_transformations is not None and column_specs is not None: _LOGGER.info( "column_transformations and column_specs were both passed. column_transformations was used." ) diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 1ed9d183bc..d15b64075c 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -654,7 +654,7 @@ def test_run_call_pipeline_service_create_with_column_specs_and_transformations( mock_pipeline_service_create, mock_pipeline_service_get, mock_dataset_tabular, - mock_dataset_alternative, + mock_dataset_tabular_alternative, mock_model_service_get, sync, ): @@ -756,10 +756,6 @@ def test_run_call_pipeline_service_create_with_column_specs_not_auto( _TEST_TRAINING_COLUMN_NAMES_ALTERNATIVE[2] ] = training_jobs.AutoMLTabularTrainingJob.column_data_types.TEXT - assert ( - column_specs == _TEST_TRAINING_COLUMN_TRANSFORMATIONS_ALTERNATIVE_NOT_AUTO - ) - job = training_jobs.AutoMLTabularTrainingJob( display_name=_TEST_DISPLAY_NAME, optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME, From d26513486702c544e336e76be1a93516900300a2 Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Fri, 9 Jul 2021 22:06:21 -0400 Subject: [PATCH 17/22] more bugs --- google/cloud/aiplatform/training_jobs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 851b2a2b47..672715ccf5 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -2667,6 +2667,8 @@ def __init__( self._column_specs = None elif column_specs is not None: self._column_specs = column_specs + else: + self._column_specs = None self._optimization_objective = optimization_objective 
self._optimization_prediction_type = optimization_prediction_type self._optimization_objective_recall_value = optimization_objective_recall_value @@ -2895,7 +2897,8 @@ def _run( # convert column specs to column transformations if self._column_specs is not None: self._column_transformations = [ - {item[1]: {"column_name": item[0]}} for item in self._column_specs.items + {item[1]: {"column_name": item[0]}} + for item in self._column_specs.items() ] # auto-populate transformations if self._column_transformations is None: From 6c2d903751dbf817469a7571ba70172c6302c720 Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Sat, 10 Jul 2021 00:29:18 -0400 Subject: [PATCH 18/22] fix tests --- .../cloud/aiplatform/datasets/time_series_dataset.py | 1 - google/cloud/aiplatform/training_jobs.py | 7 +++---- .../aiplatform/test_automl_tabular_training_jobs.py | 12 ++---------- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/google/cloud/aiplatform/datasets/time_series_dataset.py b/google/cloud/aiplatform/datasets/time_series_dataset.py index 8070daf10f..1a5d62bb39 100644 --- a/google/cloud/aiplatform/datasets/time_series_dataset.py +++ b/google/cloud/aiplatform/datasets/time_series_dataset.py @@ -26,7 +26,6 @@ from google.cloud.aiplatform import utils -# TODO: extend tabular dataset class TimeSeriesDataset(datasets._Dataset): """Managed time series dataset resource for Vertex AI""" diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 672715ccf5..fde1e829fc 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -2663,12 +2663,14 @@ def __init__( "column_transformations and column_specs were both passed. column_transformations was used." 
) if column_transformations is not None: - self._column_transformations = column_transformations self._column_specs = None + self._column_transformations = column_transformations elif column_specs is not None: self._column_specs = column_specs + self._column_transformations = None else: self._column_specs = None + self._column_transformations = None self._optimization_objective = optimization_objective self._optimization_prediction_type = optimization_prediction_type self._optimization_objective_recall_value = optimization_objective_recall_value @@ -2887,9 +2889,6 @@ def _run( Returns: model: The trained Vertex AI Model resource or None if training did not produce an Vertex AI Model. - Raises: - ValueError: When column doesn't exist in dataset. - ValueError: When target column is in transformations. """ training_task_definition = schema.training_job.definition.automl_tabular diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index d15b64075c..0fd1f57101 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -620,10 +620,7 @@ def test_run_call_pipeline_service_create_with_column_specs( test_fraction=_TEST_TEST_FRACTION_SPLIT, ) - true_managed_model = gca_model.Model( - display_name=_TEST_MODEL_DISPLAY_NAME, - encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, - ) + true_managed_model = gca_model.Model(display_name=_TEST_MODEL_DISPLAY_NAME) true_input_data_config = gca_training_pipeline.InputDataConfig( fraction_split=true_fraction_split, @@ -639,7 +636,6 @@ def test_run_call_pipeline_service_create_with_column_specs( training_task_inputs=_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE, model_to_upload=true_managed_model, input_data_config=true_input_data_config, - encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, ) mock_pipeline_service_create.assert_called_once_with( @@ -788,10 +784,7 @@ def 
test_run_call_pipeline_service_create_with_column_specs_not_auto( test_fraction=_TEST_TEST_FRACTION_SPLIT, ) - true_managed_model = gca_model.Model( - display_name=_TEST_MODEL_DISPLAY_NAME, - encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, - ) + true_managed_model = gca_model.Model(display_name=_TEST_MODEL_DISPLAY_NAME) true_input_data_config = gca_training_pipeline.InputDataConfig( fraction_split=true_fraction_split, @@ -807,7 +800,6 @@ def test_run_call_pipeline_service_create_with_column_specs_not_auto( training_task_inputs=_TEST_TRAINING_TASK_INPUTS_ALTERNATIVE_NOT_AUTO, model_to_upload=true_managed_model, input_data_config=true_input_data_config, - encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, ) mock_pipeline_service_create.assert_called_once_with( From dd2f831f830be8be354fa504b7fa1e5da85a61a3 Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Wed, 14 Jul 2021 15:56:23 -0400 Subject: [PATCH 19/22] address feedback --- google/cloud/aiplatform/training_jobs.py | 29 ++++---- .../test_automl_tabular_training_jobs.py | 69 +++---------------- 2 files changed, 24 insertions(+), 74 deletions(-) diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index fde1e829fc..c9642f1dc8 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -2542,7 +2542,7 @@ def __init__( display_name="my_display_name", optimization_prediction_type="classification", optimization_objective="minimize-log-loss", - column_specs=my_column_specs, + column_specs={"column_1": "auto", "column_2": "numeric"}, ) Args: @@ -2586,11 +2586,12 @@ def __init__( "minimize-mae" - Minimize mean-absolute error (MAE). "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE). column_specs (Dict[str, str]): - Optional. Transformations to apply to the input columns (i.e. columns other - than the targetColumn). Each transformation may produce multiple - result values from the column's value, and all are used for training. 
+ Optional. Alternative to column_transformations where the keys of the dict + are column names and their respective values are one of + AutoMLTabularTrainingJob.column_data_types. When creating transformation for BigQuery Struct column, the column - should be flattened using "." as the delimiter. + should be flattened using "." as the delimiter. Only columns with no child + should have a transformation. If an input column has no transformations on it, such a column is ignored by the training, except for the targetColumn, which should have no transformations defined on. @@ -2600,7 +2601,8 @@ def __init__( than the targetColumn). Each transformation may produce multiple result values from the column's value, and all are used for training. When creating transformation for BigQuery Struct column, the column - should be flattened using "." as the delimiter. + should be flattened using "." as the delimiter. Only columns with no child + should have a transformation. If an input column has no transformations on it, such a column is ignored by the training, except for the targetColumn, which should have no transformations defined on. @@ -2648,6 +2650,9 @@ def __init__( If set, the trained Model will be secured by this key. Overrides encryption_spec_key_name set in aiplatform.init. + + Raises: + ValueError: When both column_transformations and column_specs were passed """ super().__init__( display_name=display_name, @@ -2659,8 +2664,8 @@ def __init__( ) # user populated transformations if column_transformations is not None and column_specs is not None: - _LOGGER.info( - "column_transformations and column_specs were both passed. column_transformations was used." + raise ValueError( + "Both column_transformations and column_specs were passed. Only one is allowed." ) if column_transformations is not None: self._column_specs = None @@ -2888,7 +2893,7 @@ def _run( Returns: model: The trained Vertex AI Model resource or None if training did not - produce an Vertex AI Model. 
+ produce a Vertex AI Model. """ training_task_definition = schema.training_job.definition.automl_tabular @@ -2994,13 +2999,7 @@ def get_auto_column_specs( Returns: Dict[str, str] Column names as keys and 'auto' as values - - Raises: - RuntimeError: When no valid source is found. - ValueError: When target_column is not in dataset """ - if target_column not in dataset.column_names: - raise ValueError("Target column not in dataset.") column_names = [ column for column in dataset.column_names if column != target_column ] diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index 0fd1f57101..df3080522e 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -667,65 +667,16 @@ def test_run_call_pipeline_service_create_with_column_specs_and_transformations( assert column_specs == _TEST_TRAINING_COLUMN_SPECS - job = training_jobs.AutoMLTabularTrainingJob( - display_name=_TEST_DISPLAY_NAME, - optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME, - optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE, - column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS, - column_specs=column_specs, - optimization_objective_recall_value=None, - optimization_objective_precision_value=None, - ) - - model_from_job = job.run( - dataset=mock_dataset_tabular, - target_column=_TEST_TRAINING_TARGET_COLUMN, - model_display_name=_TEST_MODEL_DISPLAY_NAME, - training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT, - validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT, - test_fraction_split=_TEST_TEST_FRACTION_SPLIT, - predefined_split_column_name=_TEST_PREDEFINED_SPLIT_COLUMN_NAME, - weight_column=_TEST_TRAINING_WEIGHT_COLUMN, - budget_milli_node_hours=_TEST_TRAINING_BUDGET_MILLI_NODE_HOURS, - disable_early_stopping=_TEST_TRAINING_DISABLE_EARLY_STOPPING, - sync=sync, - ) - - if not sync: - 
model_from_job.wait() - - true_fraction_split = gca_training_pipeline.FractionSplit( - training_fraction=_TEST_TRAINING_FRACTION_SPLIT, - validation_fraction=_TEST_VALIDATION_FRACTION_SPLIT, - test_fraction=_TEST_TEST_FRACTION_SPLIT, - ) - - true_managed_model = gca_model.Model( - display_name=_TEST_MODEL_DISPLAY_NAME, - encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, - ) - - true_input_data_config = gca_training_pipeline.InputDataConfig( - fraction_split=true_fraction_split, - predefined_split=gca_training_pipeline.PredefinedSplit( - key=_TEST_PREDEFINED_SPLIT_COLUMN_NAME - ), - dataset_id=mock_dataset_tabular.name, - ) - - true_training_pipeline = gca_training_pipeline.TrainingPipeline( - display_name=_TEST_DISPLAY_NAME, - training_task_definition=schema.training_job.definition.automl_tabular, - training_task_inputs=_TEST_TRAINING_TASK_INPUTS, - model_to_upload=true_managed_model, - input_data_config=true_input_data_config, - encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, - ) - - mock_pipeline_service_create.assert_called_once_with( - parent=initializer.global_config.common_location_path(), - training_pipeline=true_training_pipeline, - ) + with pytest.raises(ValueError): + training_jobs.AutoMLTabularTrainingJob( + display_name=_TEST_DISPLAY_NAME, + optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME, + optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE, + column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS, + column_specs=column_specs, + optimization_objective_recall_value=None, + optimization_objective_precision_value=None, + ) @pytest.mark.parametrize("sync", [True, False]) def test_run_call_pipeline_service_create_with_column_specs_not_auto( From 8239bbd5e9a00afe027d003fd8e8b979a2e48523 Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Wed, 14 Jul 2021 18:12:23 -0400 Subject: [PATCH 20/22] one more test --- .../test_automl_tabular_training_jobs.py | 45 +++++++------------ 1 file changed, 16 insertions(+), 29 
deletions(-) diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index df3080522e..eeb233ec1a 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -572,12 +572,7 @@ def test_run_call_pipeline_service_create_if_set_additional_experiments( @pytest.mark.parametrize("sync", [True, False]) def test_run_call_pipeline_service_create_with_column_specs( - self, - mock_pipeline_service_create, - mock_pipeline_service_get, - mock_dataset_tabular_alternative, - mock_model_service_get, - sync, + self, mock_pipeline_service_create, mock_dataset_tabular_alternative, sync, ): aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME) @@ -644,21 +639,10 @@ def test_run_call_pipeline_service_create_with_column_specs( ) @pytest.mark.parametrize("sync", [True, False]) - # Should default to column_transformation when column_specs also passed - def test_run_call_pipeline_service_create_with_column_specs_and_transformations( - self, - mock_pipeline_service_create, - mock_pipeline_service_get, - mock_dataset_tabular, - mock_dataset_tabular_alternative, - mock_model_service_get, - sync, + def test_call_pipeline_service_create_with_column_specs_and_transformations_raises( + self, mock_dataset_tabular_alternative, sync, ): - aiplatform.init( - project=_TEST_PROJECT, - staging_bucket=_TEST_BUCKET_NAME, - encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, - ) + aiplatform.init() column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( dataset=mock_dataset_tabular_alternative, @@ -670,22 +654,25 @@ def test_run_call_pipeline_service_create_with_column_specs_and_transformations( with pytest.raises(ValueError): training_jobs.AutoMLTabularTrainingJob( display_name=_TEST_DISPLAY_NAME, - optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME, 
optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE, column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS, column_specs=column_specs, - optimization_objective_recall_value=None, - optimization_objective_precision_value=None, + ) + + @pytest.mark.parametrize("sync", [True, False]) + def test_get_column_specs_no_target_raises( + self, mock_dataset_tabular_alternative, sync, + ): + aiplatform.init() + + with pytest.raises(TypeError): + training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( + dataset=mock_dataset_tabular_alternative ) @pytest.mark.parametrize("sync", [True, False]) def test_run_call_pipeline_service_create_with_column_specs_not_auto( - self, - mock_pipeline_service_create, - mock_pipeline_service_get, - mock_dataset_tabular_alternative, - mock_model_service_get, - sync, + self, mock_pipeline_service_create, mock_dataset_tabular_alternative, sync, ): aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME) From 1fefacc8c3f07da8447e179fe2500dcbac14092b Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Wed, 14 Jul 2021 19:52:10 -0400 Subject: [PATCH 21/22] fix tests --- .../test_automl_tabular_training_jobs.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py index eeb233ec1a..413566440f 100644 --- a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py +++ b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py @@ -572,7 +572,12 @@ def test_run_call_pipeline_service_create_if_set_additional_experiments( @pytest.mark.parametrize("sync", [True, False]) def test_run_call_pipeline_service_create_with_column_specs( - self, mock_pipeline_service_create, mock_dataset_tabular_alternative, sync, + self, + mock_pipeline_service_create, + mock_pipeline_service_get, + mock_dataset_tabular_alternative, + mock_model_service_get, + sync, ): 
aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME) @@ -672,7 +677,12 @@ def test_get_column_specs_no_target_raises( @pytest.mark.parametrize("sync", [True, False]) def test_run_call_pipeline_service_create_with_column_specs_not_auto( - self, mock_pipeline_service_create, mock_dataset_tabular_alternative, sync, + self, + mock_pipeline_service_create, + mock_pipeline_service_get, + mock_dataset_tabular_alternative, + mock_model_service_get, + sync, ): aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME) From 322cc6628b90a52312db0b8e46a544e07060e315 Mon Sep 17 00:00:00 2001 From: Torry Yang Date: Thu, 15 Jul 2021 14:16:50 -0400 Subject: [PATCH 22/22] address feedback --- google/cloud/aiplatform/training_jobs.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index c9642f1dc8..1d738aa7a6 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -18,6 +18,7 @@ import datetime import time from typing import Dict, List, Optional, Sequence, Tuple, Union +import warnings import abc @@ -2607,6 +2608,7 @@ def __init__( ignored by the training, except for the targetColumn, which should have no transformations defined on. Only one of column_transformations or column_specs should be passed. + Consider using column_specs as column_transformations will be deprecated eventually. optimization_objective_recall_value (float): Optional. Required when maximize-precision-at-recall optimizationObjective was picked, represents the recall value at which the optimization is done. @@ -2668,13 +2670,19 @@ def __init__( "Both column_transformations and column_specs were passed. Only one is allowed." 
) if column_transformations is not None: - self._column_specs = None self._column_transformations = column_transformations + warnings.simplefilter("always", DeprecationWarning) + warnings.warn( + "consider using column_specs instead. column_transformations will be deprecated in the future.", + DeprecationWarning, + stacklevel=2, + ) elif column_specs is not None: - self._column_specs = column_specs - self._column_transformations = None + self._column_transformations = [ + {transformation: {"column_name": column_name}} + for column_name, transformation in column_specs.items() + ] else: - self._column_specs = None self._column_transformations = None self._optimization_objective = optimization_objective self._optimization_prediction_type = optimization_prediction_type @@ -2898,12 +2906,6 @@ def _run( training_task_definition = schema.training_job.definition.automl_tabular - # convert column specs to column transformations - if self._column_specs is not None: - self._column_transformations = [ - {item[1]: {"column_name": item[0]}} - for item in self._column_specs.items() - ] # auto-populate transformations if self._column_transformations is None: _LOGGER.info(