Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: column specs for tabular transformation #466

Merged
merged 23 commits into from Jul 15, 2021
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion google/cloud/aiplatform/datasets/time_series_dataset.py
Expand Up @@ -46,7 +46,7 @@ def create(
encryption_spec_key_name: Optional[str] = None,
sync: bool = True,
) -> "TimeSeriesDataset":
"""Creates a new tabular dataset.
"""Creates a new time series dataset.

Args:
display_name (str):
Expand Down
97 changes: 90 additions & 7 deletions google/cloud/aiplatform/training_jobs.py
Expand Up @@ -2524,6 +2524,7 @@ def __init__(
display_name: str,
optimization_prediction_type: str,
optimization_objective: Optional[str] = None,
column_specs: Optional[Dict[str, str]] = None,
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
column_transformations: Optional[Union[Dict, List[Dict]]] = None,
optimization_objective_recall_value: Optional[float] = None,
optimization_objective_precision_value: Optional[float] = None,
Expand All @@ -2535,6 +2536,15 @@ def __init__(
):
"""Constructs a AutoML Tabular Training Job.

Example usage:

job = training_jobs.AutoMLTabularTrainingJob(
display_name="my_display_name",
optimization_prediction_type="classification",
optimization_objective="minimize-log-loss",
column_specs=my_column_specs,
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
)

Args:
display_name (str):
Required. The user-defined name of this TrainingPipeline.
Expand Down Expand Up @@ -2575,7 +2585,7 @@ def __init__(
"minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
"minimize-mae" - Minimize mean-absolute error (MAE).
"minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
column_transformations (Optional[Union[Dict, List[Dict]]]):
column_specs (Dict[str, str]):
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
Optional. Transformations to apply to the input columns (i.e. columns other
than the targetColumn). Each transformation may produce multiple
result values from the column's value, and all are used for training.
Expand All @@ -2584,6 +2594,17 @@ def __init__(
If an input column has no transformations on it, such a column is
ignored by the training, except for the targetColumn, which should have
no transformations defined on.
Only one of column_transformations or column_specs should be passed.
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
column_transformations (Union[Dict, List[Dict]]):
Optional. Transformations to apply to the input columns (i.e. columns other
than the targetColumn). Each transformation may produce multiple
result values from the column's value, and all are used for training.
When creating transformation for BigQuery Struct column, the column
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
should be flattened using "." as the delimiter.
If an input column has no transformations on it, such a column is
ignored by the training, except for the targetColumn, which should have
no transformations defined on.
Only one of column_transformations or column_specs should be passed.
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
optimization_objective_recall_value (float):
Optional. Required when maximize-precision-at-recall optimizationObjective was
picked, represents the recall value at which the optimization is done.
Expand Down Expand Up @@ -2636,7 +2657,20 @@ def __init__(
training_encryption_spec_key_name=training_encryption_spec_key_name,
model_encryption_spec_key_name=model_encryption_spec_key_name,
)
self._column_transformations = column_transformations
# user populated transformations
if column_transformations is not None and column_specs is not None:
_LOGGER.info(
"column_transformations and column_specs were both passed. column_transformations was used."
)
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
if column_transformations is not None:
self._column_specs = None
self._column_transformations = column_transformations
elif column_specs is not None:
self._column_specs = column_specs
self._column_transformations = None
else:
self._column_specs = None
self._column_transformations = None
self._optimization_objective = optimization_objective
self._optimization_prediction_type = optimization_prediction_type
self._optimization_objective_recall_value = optimization_objective_recall_value
Expand Down Expand Up @@ -2854,11 +2888,18 @@ def _run(

Returns:
model: The trained Vertex AI Model resource or None if training did not
produce a Vertex AI Model.
produce an Vertex AI Model.
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
"""

training_task_definition = schema.training_job.definition.automl_tabular

# convert column specs to column transformations
if self._column_specs is not None:
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
self._column_transformations = [
{item[1]: {"column_name": item[0]}}
for item in self._column_specs.items()
]
# auto-populate transformations
if self._column_transformations is None:
_LOGGER.info(
"No column transformations provided, so now retrieving columns from dataset in order to set default column transformations."
Expand All @@ -2869,21 +2910,19 @@ def _run(
for column_name in dataset.column_names
if column_name != target_column
]
column_transformations = [
self._column_transformations = [
{"auto": {"column_name": column_name}} for column_name in column_names
]

_LOGGER.info(
"The column transformation of type 'auto' was set for the following columns: %s."
% column_names
)
else:
column_transformations = self._column_transformations

training_task_inputs_dict = {
# required inputs
"targetColumn": target_column,
"transformations": column_transformations,
"transformations": self._column_transformations,
"trainBudgetMilliNodeHours": budget_milli_node_hours,
# optional inputs
"weightColumnName": weight_column,
Expand Down Expand Up @@ -2934,6 +2973,50 @@ def _add_additional_experiments(self, additional_experiments: List[str]):
"""
self._additional_experiments.extend(additional_experiments)

@staticmethod
def get_auto_column_specs(
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
dataset: datasets.TabularDataset, target_column: str,
) -> Dict[str, str]:
"""Returns a dict with all non-target columns as keys and 'auto' as values.
sirtorry marked this conversation as resolved.
Show resolved Hide resolved

Example usage:

column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
dataset=my_dataset,
target_column="my_target_column",
)

Args:
dataset (datasets.TabularDataset):
Required. Intended dataset.
target_column(str):
Required. Intended target column.
Returns:
Dict[str, str]
Column names as keys and 'auto' as values

Raises:
RuntimeError: When no valid source is found.
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
ValueError: When target_column is not in dataset
"""
if target_column not in dataset.column_names:
sirtorry marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError("Target column not in dataset.")
column_names = [
column for column in dataset.column_names if column != target_column
]
column_specs = {column: "auto" for column in column_names}
return column_specs

class column_data_types:
    """String constants naming the data types usable as column spec values.

    Each attribute holds the literal string for one column type, so callers
    can write ``column_data_types.NUMERIC`` instead of typing ``"numeric"``.
    """

    # Let the service pick the type for the column.
    AUTO = "auto"

    # Single-valued column types.
    NUMERIC = "numeric"
    CATEGORICAL = "categorical"
    TIMESTAMP = "timestamp"
    TEXT = "text"

    # Repeated (array-valued) column types.
    REPEATED_NUMERIC = "repeated_numeric"
    REPEATED_CATEGORICAL = "repeated_categorical"
    REPEATED_TEXT = "repeated_text"


class AutoMLForecastingTrainingJob(_TrainingJob):
_supported_training_schemas = (schema.training_job.definition.automl_forecasting,)
Expand Down