feat: Added tabular forecasting samples #128

Merged: 16 commits, Dec 22, 2020
@@ -0,0 +1,80 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


from typing import List


def make_parent(parent: str) -> str:
    # The fully qualified parent resource name is passed through unchanged.
    return parent


def make_training_pipeline(
display_name: str,
dataset_id: str,
model_display_name: str,
target_column: str,
time_series_identifier_column: str,
time_column: str,
    static_columns: List[str],
    time_variant_past_only_columns: List[str],
    time_variant_past_and_future_columns: List[str],
forecast_window_end: int,
) -> google.cloud.aiplatform_v1alpha1.types.training_pipeline.TrainingPipeline:
    # Set the columns used for training; "auto" lets AutoML choose each column's transformation.
transformations = [
{"auto": {"column_name": "date"}},
{"auto": {"column_name": "state_name"}},
{"auto": {"column_name": "county_fips_code"}},
{"auto": {"column_name": "confirmed_cases"}},
{"auto": {"column_name": "deaths"}},
]

period = {"unit": "day", "quantity": 1}

# the inputs should be formatted according to the training_task_definition yaml file
training_task_inputs_dict = {
# required inputs
"targetColumn": target_column,

ivanmkc (Contributor Author):
Curious whether we should show comments for each of these values in the sample.


Contributor (reviewer):

@ivanmkc do you mean comments like # display_name: YOUR_DISPLAY_NAME?


ivanmkc (Contributor Author):
Thanks for reviewing. I was wondering if we should add comments for each param in the samples.

It seems Yuhan is suggesting we just point readers to the docstrings.
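
For example, the inline comments under discussion might have looked like this (hypothetical wording; the PR did not settle on any):

# target_column: name of the column the model is trained to predict, e.g. "deaths"
# time_column: name of the column that orders the observations in time, e.g. "date"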

"timeSeriesIdentifierColumn": time_series_identifier_column,
"timeColumn": time_column,
"transformations": transformations,
"period": period,
"optimizationObjective": "minimize-rmse",
"trainBudgetMilliNodeHours": 8000,
"staticColumns": static_columns,
"timeVariantPastOnlyColumns": time_variant_past_only_columns,
"timeVariantPastAndFutureColumns": time_variant_past_and_future_columns,
"forecastWindowEnd": forecast_window_end,
}

    # to_protobuf_value appears to be a generator-side helper; the emitted sample
    # below performs this step with json_format.ParseDict(training_task_inputs_dict, Value()).
    training_task_inputs = to_protobuf_value(training_task_inputs_dict)

training_pipeline = {
"display_name": display_name,
"training_task_definition": "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_forecasting_1.0.0.yaml",
"training_task_inputs": training_task_inputs,
"input_data_config": {
"dataset_id": dataset_id,
"fraction_split": {
"training_fraction": 0.8,
"validation_fraction": 0.1,
"test_fraction": 0.1,
},
},
"model_to_upload": {"display_name": model_display_name},
}

return training_pipeline
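
A hypothetical sketch of how these helpers fit together once the generator emits a runnable sample (make_parent and make_training_pipeline are the functions above; client is a PipelineServiceClient, created as in the generated file below):

# Hypothetical assembly; the real, runnable version is the generated sample further down.
parent = make_parent(f"projects/{project}/locations/{location}")
training_pipeline = make_training_pipeline(
    display_name, dataset_id, model_display_name, target_column,
    time_series_identifier_column, time_column, static_columns,
    time_variant_past_only_columns, time_variant_past_and_future_columns,
    forecast_window_end,
)
response = client.create_training_pipeline(parent=parent, training_pipeline=training_pipeline)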

5 changes: 5 additions & 0 deletions .sample_configs/process_configs.yaml
@@ -18,6 +18,7 @@ create_batch_prediction_job_custom_image_explain_sample: {}
create_batch_prediction_job_custom_tabular_explain_sample: {}
create_batch_prediction_job_sample: {}
create_batch_prediction_job_tabular_explain_sample: {}
create_batch_prediction_job_tabular_forecasting_sample: {}
create_batch_prediction_job_text_classification_sample: {}
create_batch_prediction_job_text_entity_extraction_sample: {}
create_batch_prediction_job_text_sentiment_analysis_sample: {}
@@ -73,6 +74,7 @@ create_training_pipeline_image_classification_sample: {}
create_training_pipeline_image_object_detection_sample: {}
create_training_pipeline_sample: {}
create_training_pipeline_tabular_classification_sample: {}
create_training_pipeline_tabular_forecasting_sample: {}
create_training_pipeline_tabular_regression_sample: {}
create_training_pipeline_text_classification_sample: {}
create_training_pipeline_text_entity_extraction_sample:
@@ -182,6 +184,7 @@ get_model_evaluation_sample:
- model_explanation
get_model_evaluation_slice_sample: {}
get_model_evaluation_tabular_classification_sample: {}
get_model_evaluation_tabular_forecasting_sample: {}

Contributor (reviewer):

If the tabular_forecasting variant for get_model_evaluation ends up identical to any existing get_model_evaluation sample, please skip it. (And we should separately clean up the others, but perhaps I missed something here.)


ivanmkc (Contributor Author):

Got it: we want to consolidate where possible. Let me do another pass.
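
If the variant does turn out to be identical, it would presumably just be dropped from the get_model_evaluation list in variants.yaml, something like this (hypothetical sketch; that section of variants.yaml is not shown in this diff):

get_model_evaluation:
  - tabular_classification
  - tabular_regression
  # tabular_forecasting omitted: identical to the base get_model_evaluation sample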

get_model_evaluation_tabular_regression_sample: {}
get_model_evaluation_text_classification_sample:
skip:
@@ -246,6 +249,7 @@ list_endpoints_sample: {}
list_hyperparameter_tuning_jobs_sample: {}
list_model_evaluation_slices_sample: {}
list_model_evaluations_sample: {}
list_model_evaluations_tabular_forecasting_sample: {}
list_models_sample: {}
list_specialist_pools_sample: {}
list_training_pipelines_sample: {}
@@ -288,6 +292,7 @@ predict_tabular_classification_sample:
comments:
predictions: See gs://google-cloud-aiplatform/schema/predict/prediction/tables_classification.yaml
for the format of the predictions.
predict_tabular_forecasting_sample: {}
predict_tabular_regression_sample:
api_endpoint: us-central1-prediction-aiplatform.googleapis.com
max_depth: 1
2 changes: 2 additions & 0 deletions .sample_configs/variants.yaml
@@ -22,6 +22,7 @@ create_batch_prediction_job:
- custom_image_explain
- custom_tabular_explain
- tabular_explain
- tabular_forecasting
- text_classification
- text_entity_extraction
- text_sentiment_analysis
@@ -59,6 +60,7 @@ create_training_pipeline:
- image_classification
- image_object_detection
- tabular_classification
- tabular_forecasting
- tabular_regression
- text_classification
- text_entity_extraction
@@ -0,0 +1,90 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START aiplatform_create_training_pipeline_tabular_forecasting_sample]
from typing import List

from google.cloud import aiplatform
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value


def create_training_pipeline_tabular_forecasting_sample(
project: str,
display_name: str,
dataset_id: str,
model_display_name: str,
target_column: str,
time_series_identifier_column: str,
time_column: str,
    static_columns: List[str],
    time_variant_past_only_columns: List[str],
    time_variant_past_and_future_columns: List[str],
forecast_window_end: int,
location: str = "us-central1",
api_endpoint: str = "us-central1-aiplatform.googleapis.com",
):
client_options = {"api_endpoint": api_endpoint}
# Initialize client that will be used to create and send requests.
# This client only needs to be created once, and can be reused for multiple requests.
client = aiplatform.gapic.PipelineServiceClient(client_options=client_options)
    # Set the columns used for training; "auto" lets AutoML choose each column's transformation.
transformations = [
{"auto": {"column_name": "date"}},
{"auto": {"column_name": "state_name"}},
{"auto": {"column_name": "county_fips_code"}},
{"auto": {"column_name": "confirmed_cases"}},
{"auto": {"column_name": "deaths"}},
]

    # Data granularity of the time series: one data point per day.
    period = {"unit": "day", "quantity": 1}

# the inputs should be formatted according to the training_task_definition yaml file
training_task_inputs_dict = {
# required inputs
"targetColumn": target_column,
"timeSeriesIdentifierColumn": time_series_identifier_column,
"timeColumn": time_column,
"transformations": transformations,
"period": period,
"optimizationObjective": "minimize-rmse",
"trainBudgetMilliNodeHours": 8000,
"staticColumns": static_columns,
"timeVariantPastOnlyColumns": time_variant_past_only_columns,
"timeVariantPastAndFutureColumns": time_variant_past_and_future_columns,
"forecastWindowEnd": forecast_window_end,
}

    # Convert the plain dict into a protobuf Value, the type training_task_inputs expects.
    training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value())

training_pipeline = {
"display_name": display_name,
"training_task_definition": "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_forecasting_1.0.0.yaml",
"training_task_inputs": training_task_inputs,
"input_data_config": {
"dataset_id": dataset_id,
"fraction_split": {
"training_fraction": 0.8,
"validation_fraction": 0.1,
"test_fraction": 0.1,
},
},
"model_to_upload": {"display_name": model_display_name},
}
parent = f"projects/{project}/locations/{location}"
response = client.create_training_pipeline(
parent=parent, training_pipeline=training_pipeline
)
print("response:", response)


# [END aiplatform_create_training_pipeline_tabular_forecasting_sample]
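
The create_training_pipeline call returns as soon as the pipeline resource is created; training itself runs asynchronously. The printed response includes the pipeline's resource name, which can be polled with get_training_pipeline, as the test below does via helpers.wait_for_job_state.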
@@ -0,0 +1,87 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from uuid import uuid4

from google.cloud import aiplatform
import pytest

import cancel_training_pipeline_sample
import create_training_pipeline_tabular_forecasting_sample
import delete_training_pipeline_sample
import helpers

PROJECT_ID = os.getenv("BUILD_SPECIFIC_GCLOUD_PROJECT")  # project ID supplied by the test environment
DATASET_ID = "3003302817130610688" # COVID Dataset
DISPLAY_NAME = f"temp_create_training_pipeline_test_{uuid4()}"
TARGET_COLUMN = "deaths"
PREDICTION_TYPE = "forecasting"


@pytest.fixture
def shared_state():
state = {}
yield state


@pytest.fixture(scope="function", autouse=True)
def teardown(shared_state):
yield

    # The pipeline ID is the last segment of the resource name.
    training_pipeline_id = shared_state["training_pipeline_name"].split("/")[-1]

# Stop the training pipeline
cancel_training_pipeline_sample.cancel_training_pipeline_sample(
project=PROJECT_ID, training_pipeline_id=training_pipeline_id
)

client_options = {"api_endpoint": "us-central1-aiplatform.googleapis.com"}
pipeline_client = aiplatform.gapic.PipelineServiceClient(
client_options=client_options
)

    # Wait for the training pipeline to reach the CANCELLED state
helpers.wait_for_job_state(
get_job_method=pipeline_client.get_training_pipeline,
name=shared_state["training_pipeline_name"],
)

# Delete the training pipeline
delete_training_pipeline_sample.delete_training_pipeline_sample(
project=PROJECT_ID, training_pipeline_id=training_pipeline_id
)


def test_ucaip_generated_create_training_pipeline_sample(capsys, shared_state):

create_training_pipeline_tabular_forecasting_sample.create_training_pipeline_tabular_forecasting_sample(
project=PROJECT_ID,
display_name=DISPLAY_NAME,
dataset_id=DATASET_ID,
model_display_name="permanent_tabular_forecasting_model",
target_column=TARGET_COLUMN,
time_series_identifier_column="county",
time_column="date",
static_columns=["state_name"],
time_variant_past_only_columns=["deaths"],
time_variant_past_and_future_columns=["date"],
forecast_window_end=10,
)

out, _ = capsys.readouterr()
assert "response:" in out

# Save resource name of the newly created training pipeline
shared_state["training_pipeline_name"] = helpers.get_name(out)
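
These generated tests run under pytest; assuming the usual naming convention for the test file (the diff does not show its name), invocation would look like:

pytest create_training_pipeline_tabular_forecasting_sample_test.py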