Skip to content

Commit

Permalink
feat: column specs for tabular transformation (#466)
Browse files Browse the repository at this point in the history
- adds column_specs as an alternative to column_transformation
- adds training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs
- adds aiplatform.column
  • Loading branch information
sirtorry committed Jul 15, 2021
1 parent e8121ad commit 71d0bd4
Show file tree
Hide file tree
Showing 3 changed files with 333 additions and 8 deletions.
2 changes: 1 addition & 1 deletion google/cloud/aiplatform/datasets/time_series_dataset.py
Expand Up @@ -46,7 +46,7 @@ def create(
encryption_spec_key_name: Optional[str] = None,
sync: bool = True,
) -> "TimeSeriesDataset":
"""Creates a new tabular dataset.
"""Creates a new time series dataset.
Args:
display_name (str):
Expand Down
98 changes: 91 additions & 7 deletions google/cloud/aiplatform/training_jobs.py
Expand Up @@ -18,6 +18,7 @@
import datetime
import time
from typing import Dict, List, Optional, Sequence, Tuple, Union
import warnings

import abc

Expand Down Expand Up @@ -2525,6 +2526,7 @@ def __init__(
display_name: str,
optimization_prediction_type: str,
optimization_objective: Optional[str] = None,
column_specs: Optional[Dict[str, str]] = None,
column_transformations: Optional[Union[Dict, List[Dict]]] = None,
optimization_objective_recall_value: Optional[float] = None,
optimization_objective_precision_value: Optional[float] = None,
Expand All @@ -2536,6 +2538,15 @@ def __init__(
):
"""Constructs a AutoML Tabular Training Job.
Example usage:
job = training_jobs.AutoMLTabularTrainingJob(
display_name="my_display_name",
optimization_prediction_type="classification",
optimization_objective="minimize-log-loss",
column_specs={"column_1": "auto", "column_2": "numeric"},
)
Args:
display_name (str):
Required. The user-defined name of this TrainingPipeline.
Expand Down Expand Up @@ -2576,15 +2587,29 @@ def __init__(
"minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
"minimize-mae" - Minimize mean-absolute error (MAE).
"minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
column_transformations (Optional[Union[Dict, List[Dict]]]):
column_specs (Dict[str, str]):
Optional. Alternative to column_transformations where the keys of the dict
are column names and their respective values are one of
AutoMLTabularTrainingJob.column_data_types.
When creating transformation for BigQuery Struct column, the column
should be flattened using "." as the delimiter. Only columns with no child
should have a transformation.
If an input column has no transformations on it, such a column is
ignored by the training, except for the targetColumn, which should have
no transformations defined on it.
Only one of column_transformations or column_specs should be passed.
column_transformations (Union[Dict, List[Dict]]):
Optional. Transformations to apply to the input columns (i.e. columns other
than the targetColumn). Each transformation may produce multiple
result values from the column's value, and all are used for training.
When creating transformation for BigQuery Struct column, the column
should be flattened using "." as the delimiter.
should be flattened using "." as the delimiter. Only columns with no child
should have a transformation.
If an input column has no transformations on it, such a column is
ignored by the training, except for the targetColumn, which should have
no transformations defined on it.
Only one of column_transformations or column_specs should be passed.
Consider using column_specs as column_transformations will be deprecated eventually.
optimization_objective_recall_value (float):
Optional. Required when maximize-precision-at-recall optimizationObjective was
picked, represents the recall value at which the optimization is done.
Expand Down Expand Up @@ -2628,6 +2653,9 @@ def __init__(
If set, the trained Model will be secured by this key.
Overrides encryption_spec_key_name set in aiplatform.init.
Raises:
ValueError: When both column_transformations and column_specs were passed
"""
super().__init__(
display_name=display_name,
Expand All @@ -2637,7 +2665,26 @@ def __init__(
training_encryption_spec_key_name=training_encryption_spec_key_name,
model_encryption_spec_key_name=model_encryption_spec_key_name,
)
self._column_transformations = column_transformations
# user populated transformations
if column_transformations is not None and column_specs is not None:
raise ValueError(
"Both column_transformations and column_specs were passed. Only one is allowed."
)
if column_transformations is not None:
self._column_transformations = column_transformations
warnings.simplefilter("always", DeprecationWarning)
warnings.warn(
"consider using column_specs instead. column_transformations will be deprecated in the future.",
DeprecationWarning,
stacklevel=2,
)
elif column_specs is not None:
self._column_transformations = [
{transformation: {"column_name": column_name}}
for column_name, transformation in column_specs.items()
]
else:
self._column_transformations = None
self._optimization_objective = optimization_objective
self._optimization_prediction_type = optimization_prediction_type
self._optimization_objective_recall_value = optimization_objective_recall_value
Expand Down Expand Up @@ -2860,6 +2907,7 @@ def _run(

training_task_definition = schema.training_job.definition.automl_tabular

# auto-populate transformations
if self._column_transformations is None:
_LOGGER.info(
"No column transformations provided, so now retrieving columns from dataset in order to set default column transformations."
Expand All @@ -2870,21 +2918,19 @@ def _run(
for column_name in dataset.column_names
if column_name != target_column
]
column_transformations = [
self._column_transformations = [
{"auto": {"column_name": column_name}} for column_name in column_names
]

_LOGGER.info(
"The column transformation of type 'auto' was set for the following columns: %s."
% column_names
)
else:
column_transformations = self._column_transformations

training_task_inputs_dict = {
# required inputs
"targetColumn": target_column,
"transformations": column_transformations,
"transformations": self._column_transformations,
"trainBudgetMilliNodeHours": budget_milli_node_hours,
# optional inputs
"weightColumnName": weight_column,
Expand Down Expand Up @@ -2935,6 +2981,44 @@ def _add_additional_experiments(self, additional_experiments: List[str]):
"""
self._additional_experiments.extend(additional_experiments)

@staticmethod
def get_auto_column_specs(
    dataset: datasets.TabularDataset, target_column: str,
) -> Dict[str, str]:
    """Builds a column_specs dict that maps every non-target column to 'auto'.

    Example usage:

    column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
        dataset=my_dataset,
        target_column="my_target_column",
    )

    Args:
        dataset (datasets.TabularDataset):
            Required. Dataset whose ``column_names`` are read.
        target_column (str):
            Required. Name of the target column; it is left out of the result.

    Returns:
        Dict[str, str]
            One entry per column name other than ``target_column``, each
            mapped to the string 'auto'.
    """
    # Single pass over the dataset's columns: skip the target, tag the rest.
    return {
        name: "auto" for name in dataset.column_names if name != target_column
    }

class column_data_types:
    """String names of the data types that may be used as column_specs values.

    Each constant is a plain ``str``; when column_specs is converted into
    transformations, the value becomes the key of that column's
    transformation dict.
    """

    # Scalar column types.
    AUTO = "auto"
    NUMERIC = "numeric"
    CATEGORICAL = "categorical"
    TIMESTAMP = "timestamp"
    TEXT = "text"

    # Repeated counterparts of the scalar types above.
    REPEATED_NUMERIC = "repeated_numeric"
    REPEATED_CATEGORICAL = "repeated_categorical"
    REPEATED_TEXT = "repeated_text"


class AutoMLForecastingTrainingJob(_TrainingJob):
_supported_training_schemas = (schema.training_job.definition.automl_forecasting,)
Expand Down

0 comments on commit 71d0bd4

Please sign in to comment.