Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added tabular forecasting samples #128

Merged
merged 16 commits into from Dec 22, 2020
@@ -0,0 +1,90 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

def make_parent(parent: str) -> str:
    """Return the fully-qualified parent resource path unchanged.

    Pass-through helper kept for symmetry with the generated sample scaffolding.
    """
    return parent

def make_training_pipeline(
display_name: str,
dataset_id: str,
model_display_name: str,
target_column: str,
time_series_identifier_column: str,
time_column: str,
static_columns: str,
time_variant_past_only_columns: str,
time_variant_past_and_future_columns: str,
forecast_window_end: int,
) -> google.cloud.aiplatform_v1alpha1.types.training_pipeline.TrainingPipeline:
# set the columns used for training and their data types
transformations = [
{"auto": {"column_name": "date"}},
{"auto": {"column_name": "state_name"}},
{"auto": {"column_name": "county_fips_code"}},
{"auto": {"column_name": "confirmed_cases"}},
{"auto": {"column_name": "deaths"}}
]

period = {"unit": "day", "quantity": 1}

training_task_inputs_dict = {
# required inputs
"targetColumn": target_column,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious whether we should show comments for each of these values in the sample.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ivanmkc do you mean comments like # display_name: YOUR_DISPLAY_NAME?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for reviewing, I was wondering if we should add comments for each param in the samples.

Seems like Yuhan is suggesting to just tell them to read the docstrings.

"timeSeriesIdentifierColumn": time_series_identifier_column,
"timeColumn": time_column,
"transformations": transformations,
"period": period,

# Objective function the model is to be optimized towards.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it's better to point to the doc pages instead of including the information below here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean include a link in the comments?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right - if the information here is available on a particular documentation page.

# The training process creates a Model that optimizes the value of the objective
# function over the validation set. The supported optimization objectives:
# "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
# "minimize-mae" - Minimize mean-absolute error (MAE).
# "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
# "minimize-rmspe" - Minimize root-mean-squared percentage error (RMSPE).
# "minimize-wape-mae" - Minimize the combination of weighted absolute percentage error (WAPE)
# and mean-absolute-error (MAE).
# "minimize-quantile-loss" - Minimize the quantile loss at the defined quantiles.
"optimizationObjective": "minimize-rmse",
"trainBudgetMilliNodeHours": 8000,
"staticColumns": static_columns,
"timeVariantPastOnlyColumns": time_variant_past_only_columns,
"timeVariantPastAndFutureColumns": time_variant_past_and_future_columns,
"forecastWindowEnd": forecast_window_end,
}

training_task_inputs = to_protobuf_value(training_task_inputs_dict)

training_pipeline = {
'display_name': display_name,
'training_task_definition': "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_forecasting_1.0.0.yaml",
'training_task_inputs': training_task_inputs,
'input_data_config': {
'dataset_id': dataset_id,
'fraction_split': {
'training_fraction': 0.8,
'validation_fraction': 0.1,
'test_fraction': 0.1,
}
},
'model_to_upload': {
'display_name': model_display_name
}
}

return training_pipeline

@@ -0,0 +1,20 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

def make_name(name: str) -> str:
    """Return the resource name unchanged.

    Pass-through helper kept for symmetry with the generated sample scaffolding.
    """
    return name

@@ -0,0 +1,5 @@
def make_parent(parent: str) -> str:
    """Identity helper: hand back the given parent resource path as-is."""
    return parent

4 changes: 4 additions & 0 deletions .sample_configs/process_configs.yaml
Expand Up @@ -73,6 +73,7 @@ create_training_pipeline_image_classification_sample: {}
create_training_pipeline_image_object_detection_sample: {}
create_training_pipeline_sample: {}
create_training_pipeline_tabular_classification_sample: {}
create_training_pipeline_tabular_forecasting_sample: {}
create_training_pipeline_tabular_regression_sample: {}
create_training_pipeline_text_classification_sample: {}
create_training_pipeline_text_entity_extraction_sample:
Expand Down Expand Up @@ -182,6 +183,7 @@ get_model_evaluation_sample:
- model_explanation
get_model_evaluation_slice_sample: {}
get_model_evaluation_tabular_classification_sample: {}
get_model_evaluation_tabular_forecasting_sample: {}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the tabular_forecasting variant for get_model_evaluation ends up identical to any get_model_evaluation, please skip it. (And we should separately clean up the others — but perhaps I missed something here.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I got it, we want to consolidate if possible. Let me do another pass.

get_model_evaluation_tabular_regression_sample: {}
get_model_evaluation_text_classification_sample:
skip:
Expand Down Expand Up @@ -246,6 +248,7 @@ list_endpoints_sample: {}
list_hyperparameter_tuning_jobs_sample: {}
list_model_evaluation_slices_sample: {}
list_model_evaluations_sample: {}
list_model_evaluations_tabular_forecasting_sample: {}
list_models_sample: {}
list_specialist_pools_sample: {}
list_training_pipelines_sample: {}
Expand Down Expand Up @@ -288,6 +291,7 @@ predict_tabular_classification_sample:
comments:
predictions: See gs://google-cloud-aiplatform/schema/predict/prediction/tables_classification.yaml
for the format of the predictions.
predict_tabular_forecasting_sample: {}
predict_tabular_regression_sample:
api_endpoint: us-central1-prediction-aiplatform.googleapis.com
max_depth: 1
Expand Down
4 changes: 3 additions & 1 deletion .sample_configs/variants.yaml
Expand Up @@ -59,6 +59,7 @@ create_training_pipeline:
- image_classification
- image_object_detection
- tabular_classification
- tabular_forecasting
- tabular_regression
- text_classification
- text_entity_extraction
Expand Down Expand Up @@ -131,6 +132,7 @@ get_model_evaluation:
- image_classification
- image_object_detection
- tabular_classification
- tabular_forecasting
- tabular_regression
- text_classification
- text_entity_extraction
Expand Down Expand Up @@ -175,7 +177,7 @@ list_hyperparameter_tuning_jobs:
list_model_evaluation_slices:
- ''
list_model_evaluations:
- ''
- tabular_forecasting
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this would replace the list_model_evaluation sample. Please keep the "default" variant keyed with the empty string.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also - perhaps we don't need a special list_model_evaluations variant unless the call to list_model_evaluations is different in the tabular_forecasting use case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh I see. Good catch.

list_models:
- ''
list_specialist_pools:
Expand Down
@@ -0,0 +1,99 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START aiplatform_create_training_pipeline_tabular_forecasting_sample]
from google.cloud import aiplatform
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value


def create_training_pipeline_tabular_forecasting_sample(
    project: str,
    display_name: str,
    dataset_id: str,
    model_display_name: str,
    target_column: str,
    time_series_identifier_column: str,
    time_column: str,
    static_columns: list,
    time_variant_past_only_columns: list,
    time_variant_past_and_future_columns: list,
    forecast_window_end: int,
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
):
    """Create an AutoML tabular-forecasting training pipeline on AI Platform.

    Args:
        project: Google Cloud project ID.
        display_name: Display name for the new training pipeline.
        dataset_id: ID of the tabular dataset to train on.
        model_display_name: Display name for the model uploaded on success.
        target_column: Column the model is trained to predict.
        time_series_identifier_column: Column identifying each time series.
        time_column: Column holding the timestamp of each row.
        static_columns: Columns whose values are constant over time.
        time_variant_past_only_columns: Columns known only for past rows.
        time_variant_past_and_future_columns: Columns known for past and
            future rows.
        forecast_window_end: End of the forecast horizon (in `period` units).
        location: Region of the dataset and pipeline.
        api_endpoint: Regional API endpoint to send requests to.

    Prints the created TrainingPipeline; does not return a value.
    """
    client_options = {"api_endpoint": api_endpoint}
    # Initialize client that will be used to create and send requests.
    # This client only needs to be created once, and can be reused for multiple requests.
    client = aiplatform.gapic.PipelineServiceClient(client_options=client_options)
    # set the columns used for training and their data types
    # NOTE: these column names are specific to the public COVID sample dataset;
    # adjust them to match your own dataset's schema.
    transformations = [
        {"auto": {"column_name": "date"}},
        {"auto": {"column_name": "state_name"}},
        {"auto": {"column_name": "county_fips_code"}},
        {"auto": {"column_name": "confirmed_cases"}},
        {"auto": {"column_name": "deaths"}},
    ]

    # Granularity of the time series: one data point per day.
    period = {"unit": "day", "quantity": 1}

    training_task_inputs_dict = {
        # required inputs
        "targetColumn": target_column,
        "timeSeriesIdentifierColumn": time_series_identifier_column,
        "timeColumn": time_column,
        "transformations": transformations,
        "period": period,
        # Objective function the model is to be optimized towards.
        # The training process creates a Model that optimizes the value of the objective
        # function over the validation set. The supported optimization objectives:
        # "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
        # "minimize-mae" - Minimize mean-absolute error (MAE).
        # "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
        # "minimize-rmspe" - Minimize root-mean-squared percentage error (RMSPE).
        # "minimize-wape-mae" - Minimize the combination of weighted absolute percentage error (WAPE)
        # and mean-absolute-error (MAE).
        # "minimize-quantile-loss" - Minimize the quantile loss at the defined quantiles.
        "optimizationObjective": "minimize-rmse",
        # Training budget in milli node-hours (8000 = 8 node-hours).
        "trainBudgetMilliNodeHours": 8000,
        "staticColumns": static_columns,
        "timeVariantPastOnlyColumns": time_variant_past_only_columns,
        "timeVariantPastAndFutureColumns": time_variant_past_and_future_columns,
        "forecastWindowEnd": forecast_window_end,
    }

    # The API expects training_task_inputs as a protobuf Value, not a dict.
    training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value())

    training_pipeline = {
        "display_name": display_name,
        # Schema identifying this as an AutoML forecasting training job.
        "training_task_definition": "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_forecasting_1.0.0.yaml",
        "training_task_inputs": training_task_inputs,
        "input_data_config": {
            "dataset_id": dataset_id,
            # 80/10/10 train/validation/test split.
            "fraction_split": {
                "training_fraction": 0.8,
                "validation_fraction": 0.1,
                "test_fraction": 0.1,
            },
        },
        "model_to_upload": {"display_name": model_display_name},
    }
    parent = f"projects/{project}/locations/{location}"
    response = client.create_training_pipeline(
        parent=parent, training_pipeline=training_pipeline
    )
    print("response:", response)


# [END aiplatform_create_training_pipeline_tabular_forecasting_sample]
@@ -0,0 +1,87 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from uuid import uuid4

from google.cloud import aiplatform
import pytest

import cancel_training_pipeline_sample
import create_training_pipeline_tabular_forecasting_sample
import delete_training_pipeline_sample
import helpers

PROJECT_ID = os.getenv("BUILD_SPECIFIC_GCLOUD_PROJECT")
DATASET_ID = "3003302817130610688" # COVID Dataset
DISPLAY_NAME = f"temp_create_training_pipeline_test_{uuid4()}"
TARGET_COLUMN = "deaths"
PREDICTION_TYPE = "forecasting"


@pytest.fixture
def shared_state():
    """Yield a mutable dict shared between the test body and teardown."""
    yield {}


@pytest.fixture(scope="function", autouse=True)
def teardown(shared_state):
    """Cancel and delete the training pipeline created by the test.

    Runs automatically after each test in this module. Cancels the pipeline,
    waits for it to reach a terminal (cancelled) state, then deletes it.
    NOTE(review): if the test fails before storing "training_pipeline_name"
    in shared_state, the lookup below raises KeyError — confirm intended.
    """
    yield

    # The pipeline ID is the last path segment of the full resource name.
    training_pipeline_id = shared_state["training_pipeline_name"].split("/")[-1]

    # Stop the training pipeline
    cancel_training_pipeline_sample.cancel_training_pipeline_sample(
        project=PROJECT_ID, training_pipeline_id=training_pipeline_id
    )

    client_options = {"api_endpoint": "us-central1-aiplatform.googleapis.com"}
    pipeline_client = aiplatform.gapic.PipelineServiceClient(
        client_options=client_options
    )

    # Waiting for training pipeline to be in CANCELLED state
    helpers.wait_for_job_state(
        get_job_method=pipeline_client.get_training_pipeline,
        name=shared_state["training_pipeline_name"],
    )

    # Delete the training pipeline
    delete_training_pipeline_sample.delete_training_pipeline_sample(
        project=PROJECT_ID, training_pipeline_id=training_pipeline_id
    )


def test_ucaip_generated_create_training_pipeline_sample(capsys, shared_state):
    """Smoke-test the forecasting sample against the COVID dataset.

    Invokes the sample, asserts it printed a response, and records the new
    pipeline's resource name so the autouse teardown fixture can clean it up.
    """
    # Column arguments below match the COVID dataset schema (DATASET_ID above).
    create_training_pipeline_tabular_forecasting_sample.create_training_pipeline_tabular_forecasting_sample(
        project=PROJECT_ID,
        display_name=DISPLAY_NAME,
        dataset_id=DATASET_ID,
        model_display_name="permanent_tabular_forecasting_model",
        target_column=TARGET_COLUMN,
        time_series_identifier_column="county",
        time_column="date",
        static_columns=["state_name"],
        time_variant_past_only_columns=["deaths"],
        time_variant_past_and_future_columns=["date"],
        forecast_window_end=10,
    )

    # The sample prints the API response; its presence signals success.
    out, _ = capsys.readouterr()
    assert "response:" in out

    # Save resource name of the newly created training pipeline
    shared_state["training_pipeline_name"] = helpers.get_name(out)