/
automl_time_series_forecasting.py
398 lines (318 loc) · 16.1 KB
/
automl_time_series_forecasting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
# -*- coding: utf-8 -*-
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import proto # type: ignore
from google.cloud.aiplatform.v1beta1.schema.trainingjob.definition_v1beta1.types import (
export_evaluated_data_items_config as gcastd_export_evaluated_data_items_config,
)
# proto-plus module descriptor: the protobuf package these messages belong
# to, plus the set of message names declared in this file (also used to
# build ``__all__`` at the bottom of the module).
__protobuf__ = proto.module(
    package="google.cloud.aiplatform.v1beta1.schema.trainingjob.definition",
    manifest={
        "AutoMlForecasting",
        "AutoMlForecastingInputs",
        "AutoMlForecastingMetadata",
    },
)
class AutoMlForecasting(proto.Message):
    r"""TrainingJob definition that trains an AutoML Forecasting Model
    and uploads it.

    Attributes:
        inputs (google.cloud.aiplatform.v1beta1.schema.trainingjob.definition_v1beta1.types.AutoMlForecastingInputs):
            Input parameters of this TrainingJob.
        metadata (google.cloud.aiplatform.v1beta1.schema.trainingjob.definition_v1beta1.types.AutoMlForecastingMetadata):
            Metadata information.
    """

    # Message types are referenced by name because both are declared
    # further down in this module.
    inputs = proto.Field(
        proto.MESSAGE,
        number=1,
        message="AutoMlForecastingInputs",
    )
    metadata = proto.Field(
        proto.MESSAGE,
        number=2,
        message="AutoMlForecastingMetadata",
    )
class AutoMlForecastingInputs(proto.Message):
    r"""Input parameters of an AutoML Forecasting TrainingJob.

    Attributes:
        target_column (str):
            Name of the column that the model is to predict.
        time_series_identifier_column (str):
            Name of the column that identifies the time series.
        time_column (str):
            Name of the column that identifies time order in the
            time series.
        transformations (Sequence[google.cloud.aiplatform.v1beta1.schema.trainingjob.definition_v1beta1.types.AutoMlForecastingInputs.Transformation]):
            Each transformation applies a transform function to
            the given input column, and the result is used for
            training. When creating a transformation for a
            BigQuery Struct column, the column should be
            flattened using "." as the delimiter.
        optimization_objective (str):
            Objective function the model optimizes towards. The
            training process creates a model that optimizes the
            value of the objective function over the validation
            set.

            The supported optimization objectives:

            -  "minimize-rmse" (default) - Minimize root-mean-squared
               error (RMSE).

            -  "minimize-mae" - Minimize mean-absolute error (MAE).

            -  "minimize-rmsle" - Minimize root-mean-squared log error
               (RMSLE).

            -  "minimize-rmspe" - Minimize root-mean-squared percentage
               error (RMSPE).

            -  "minimize-wape-mae" - Minimize the combination of
               weighted absolute percentage error (WAPE) and
               mean-absolute-error (MAE).

            -  "minimize-quantile-loss" - Minimize the quantile loss at
               the quantiles defined in ``quantiles``.
        train_budget_milli_node_hours (int):
            Required. Train budget for creating this model,
            expressed in milli node hours, i.e. a value of 1,000
            in this field means 1 node hour.

            The training cost of the model will not exceed this
            budget. The final cost will be attempted to be close
            to the budget, though it may end up being (even)
            noticeably smaller - at the backend's discretion.
            This especially may happen when further model
            training ceases to provide any improvements.

            If the budget is set to a value known to be
            insufficient to train a model for the given dataset,
            the training won't be attempted and will error.

            The train budget must be between 1,000 and 72,000
            milli node hours, inclusive.
        weight_column (str):
            Name of the column to use as the weight column.
            Higher values in this column give more importance to
            the row during model training. The column must have
            numeric values between 0 and 10000 inclusively; 0
            means the row is ignored for training. If the weight
            column field is not set, then all rows are assumed
            to have an equal weight of 1.
        time_series_attribute_columns (Sequence[str]):
            Names of the columns to use as attribute columns.
            The value of these columns does not vary as a
            function of time. For example, store ID or item
            color.
        unavailable_at_forecast_columns (Sequence[str]):
            Names of columns that are unavailable when a
            forecast is requested. This column contains
            information for the given entity (identified by the
            time_series_identifier_column) that is unknown
            before the forecast. For example, actual weather on
            a given day.
        available_at_forecast_columns (Sequence[str]):
            Names of columns that are available and provided
            when a forecast is requested. These columns contain
            information for the given entity (identified by the
            time_series_identifier_column column) that is known
            at forecast. For example, predicted weather for a
            specific day.
        data_granularity (google.cloud.aiplatform.v1beta1.schema.trainingjob.definition_v1beta1.types.AutoMlForecastingInputs.Granularity):
            Expected difference in time granularity between rows
            in the data.
        forecast_horizon (int):
            The amount of time into the future for which
            forecasted values for the target are returned.
            Expressed in number of units defined by the
            ``data_granularity`` field.
        context_window (int):
            The amount of time into the past that training and
            prediction data is used for model training and
            prediction, respectively. Expressed in number of
            units defined by the ``data_granularity`` field.
        export_evaluated_data_items_config (google.cloud.aiplatform.v1beta1.schema.trainingjob.definition_v1beta1.types.ExportEvaluatedDataItemsConfig):
            Configuration for exporting test set predictions to
            a BigQuery table. If this configuration is absent,
            then the export is not performed.
        quantiles (Sequence[float]):
            Quantiles to use for the minimize-quantile-loss
            ``optimization_objective``. Up to 5 quantiles are
            allowed, with values between 0 and 1, exclusive.
            Required if the value of optimization_objective is
            minimize-quantile-loss. Represents the percent
            quantiles to use for that objective. Quantiles must
            be unique.
        validation_options (str):
            Validation options for the data validation
            component. The available options are:

            -  "fail-pipeline" - default, will validate against the
               validation and fail the pipeline if it fails.

            -  "ignore-validation" - ignore the results of the
               validation and continue
        additional_experiments (Sequence[str]):
            Additional experiment flags for the time series
            forecasting training.
    """

    class Transformation(proto.Message):
        r"""A transformation applied to a single input column.

        All fields below belong to the ``transformation_detail`` oneof.

        Attributes:
            auto (google.cloud.aiplatform.v1beta1.schema.trainingjob.definition_v1beta1.types.AutoMlForecastingInputs.Transformation.AutoTransformation):

            numeric (google.cloud.aiplatform.v1beta1.schema.trainingjob.definition_v1beta1.types.AutoMlForecastingInputs.Transformation.NumericTransformation):

            categorical (google.cloud.aiplatform.v1beta1.schema.trainingjob.definition_v1beta1.types.AutoMlForecastingInputs.Transformation.CategoricalTransformation):

            timestamp (google.cloud.aiplatform.v1beta1.schema.trainingjob.definition_v1beta1.types.AutoMlForecastingInputs.Transformation.TimestampTransformation):

            text (google.cloud.aiplatform.v1beta1.schema.trainingjob.definition_v1beta1.types.AutoMlForecastingInputs.Transformation.TextTransformation):

        """

        class AutoTransformation(proto.Message):
            r"""The training pipeline infers the proper transformation
            based on the statistics of the dataset.

            Attributes:
                column_name (str):

            """

            column_name = proto.Field(proto.STRING, number=1)

        class NumericTransformation(proto.Message):
            r"""The training pipeline performs the following
            transformation functions.

            -  The value converted to float32.

            -  The z_score of the value.

            -  log(value+1) when the value is greater than or equal to 0.
               Otherwise, this transformation is not applied and the
               value is considered a missing value.

            -  z_score of log(value+1) when the value is greater than or
               equal to 0. Otherwise, this transformation is not applied
               and the value is considered a missing value.

            -  A boolean value that indicates whether the value is valid.

            Attributes:
                column_name (str):

            """

            column_name = proto.Field(proto.STRING, number=1)

        class CategoricalTransformation(proto.Message):
            r"""The training pipeline performs the following
            transformation functions.

            -  The categorical string as is--no change to case,
               punctuation, spelling, tense, and so on.

            -  Convert the category name to a dictionary lookup index and
               generate an embedding for each index.

            -  Categories that appear less than 5 times in the training
               dataset are treated as the "unknown" category. The
               "unknown" category gets its own special lookup index and
               resulting embedding.

            Attributes:
                column_name (str):

            """

            column_name = proto.Field(proto.STRING, number=1)

        class TimestampTransformation(proto.Message):
            r"""The training pipeline performs the following
            transformation functions.

            -  Apply the transformation functions for Numerical columns.

            -  Determine the year, month, day, and weekday. Treat each
               value from the timestamp as a Categorical column.

            -  Invalid numerical values (for example, values that fall
               outside of a typical timestamp range, or are extreme
               values) receive no special treatment and are not removed.

            Attributes:
                column_name (str):

                time_format (str):
                    The format in which the time field is expressed.
                    The time_format must either be one of:

                    -  ``unix-seconds``

                    -  ``unix-milliseconds``

                    -  ``unix-microseconds``

                    -  ``unix-nanoseconds``

                    (for respectively number of seconds, milliseconds,
                    microseconds and nanoseconds since start of the
                    Unix epoch);

                    or be written in ``strftime`` syntax.

                    If time_format is not set, then the default format
                    is RFC 3339 ``date-time`` format, where
                    ``time-offset`` = ``"Z"`` (e.g.
                    1985-04-12T23:20:50.52Z)
            """

            column_name = proto.Field(proto.STRING, number=1)
            time_format = proto.Field(proto.STRING, number=2)

        class TextTransformation(proto.Message):
            r"""The training pipeline performs the following
            transformation functions.

            -  The text as is--no change to case, punctuation, spelling,
               tense, and so on.

            -  Convert the category name to a dictionary lookup index and
               generate an embedding for each index.

            Attributes:
                column_name (str):

            """

            column_name = proto.Field(proto.STRING, number=1)

        # Members of the ``transformation_detail`` oneof.
        auto = proto.Field(
            proto.MESSAGE,
            number=1,
            oneof="transformation_detail",
            message="AutoMlForecastingInputs.Transformation.AutoTransformation",
        )
        numeric = proto.Field(
            proto.MESSAGE,
            number=2,
            oneof="transformation_detail",
            message="AutoMlForecastingInputs.Transformation.NumericTransformation",
        )
        categorical = proto.Field(
            proto.MESSAGE,
            number=3,
            oneof="transformation_detail",
            message="AutoMlForecastingInputs.Transformation.CategoricalTransformation",
        )
        timestamp = proto.Field(
            proto.MESSAGE,
            number=4,
            oneof="transformation_detail",
            message="AutoMlForecastingInputs.Transformation.TimestampTransformation",
        )
        text = proto.Field(
            proto.MESSAGE,
            number=5,
            oneof="transformation_detail",
            message="AutoMlForecastingInputs.Transformation.TextTransformation",
        )

    class Granularity(proto.Message):
        r"""A duration of time expressed in time granularity units.

        Attributes:
            unit (str):
                The time granularity unit of this time period. The
                supported units are:

                -  "minute"

                -  "hour"

                -  "day"

                -  "week"

                -  "month"

                -  "year".
            quantity (int):
                The number of granularity units between data points
                in the training data. If ``unit`` is ``minute``, can
                be 1, 5, 10, 15, or 30. For all other values of
                ``unit``, must be 1.
        """

        unit = proto.Field(proto.STRING, number=1)
        quantity = proto.Field(proto.INT64, number=2)

    target_column = proto.Field(proto.STRING, number=1)
    time_series_identifier_column = proto.Field(proto.STRING, number=2)
    time_column = proto.Field(proto.STRING, number=3)
    transformations = proto.RepeatedField(
        proto.MESSAGE,
        number=4,
        message=Transformation,
    )
    optimization_objective = proto.Field(proto.STRING, number=5)
    train_budget_milli_node_hours = proto.Field(proto.INT64, number=6)
    weight_column = proto.Field(proto.STRING, number=7)
    # Field numbers are not sequential from here on; they are part of the
    # wire format and must stay exactly as declared.
    time_series_attribute_columns = proto.RepeatedField(proto.STRING, number=19)
    unavailable_at_forecast_columns = proto.RepeatedField(proto.STRING, number=20)
    available_at_forecast_columns = proto.RepeatedField(proto.STRING, number=21)
    data_granularity = proto.Field(
        proto.MESSAGE,
        number=22,
        message=Granularity,
    )
    forecast_horizon = proto.Field(proto.INT64, number=23)
    context_window = proto.Field(proto.INT64, number=24)
    export_evaluated_data_items_config = proto.Field(
        proto.MESSAGE,
        number=15,
        message=gcastd_export_evaluated_data_items_config.ExportEvaluatedDataItemsConfig,
    )
    quantiles = proto.RepeatedField(proto.DOUBLE, number=16)
    validation_options = proto.Field(proto.STRING, number=17)
    additional_experiments = proto.RepeatedField(proto.STRING, number=25)
class AutoMlForecastingMetadata(proto.Message):
    r"""Model metadata specific to AutoML Forecasting.

    Attributes:
        train_cost_milli_node_hours (int):
            Output only. Actual training cost of the model,
            expressed in milli node hours, i.e. a value of 1,000
            in this field means 1 node hour. Guaranteed to not
            exceed the train budget.
    """

    train_cost_milli_node_hours = proto.Field(proto.INT64, number=1)
# Public names of this module: the message names registered in the
# ``__protobuf__`` manifest, sorted and frozen into a tuple.
__all__ = tuple(sorted(__protobuf__.manifest))