feat(bigquery): Add support for ML model export
* Add model support to Project#extract and #extract_job
* Add ExtractJob#model?
* Add ExtractJob#ml_tf_saved_model?
* Add ExtractJob#ml_xgboost_booster?
* Add Model#extract and #extract_job

closes: #7061
pr: #7451
quartzmo committed Sep 1, 2020
1 parent 7a834b0 commit 7626b17
Showing 20 changed files with 1,238 additions and 215 deletions.
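Taken together, the new surface lets a trained BigQuery ML model be exported to Cloud Storage either synchronously or through a job. A minimal sketch of the intended usage, assuming a dataset my_dataset that already contains a trained model my_model and a bucket my-bucket (all hypothetical names):

require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
dataset  = bigquery.dataset "my_dataset"   # hypothetical dataset
model    = dataset.model "my_model"        # hypothetical trained BigQuery ML model

# Blocking convenience method added on Model; returns true on success.
model.extract "gs://my-bucket/my_model"

# Asynchronous form; returns an ExtractJob exposing the new predicates.
extract_job = model.extract_job "gs://my-bucket/my_model"
extract_job.wait_until_done!
extract_job.model?              # => true
extract_job.ml_tf_saved_model?  # => true  (TensorFlow SavedModel is the default export format)
extract_job.ml_xgboost_booster? # => false

The same model argument can also be passed to Project#extract and Project#extract_job, as the acceptance tests below exercise.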
57 changes: 57 additions & 0 deletions google-cloud-bigquery/acceptance/bigquery/bigquery_test.rb
@@ -54,6 +54,22 @@
t
end
let(:dataset_with_access_id) { "#{prefix}_dataset_with_access" }
let(:model_id) { "model_#{SecureRandom.hex(4)}" }
let :model_sql do
model_sql = <<~MODEL_SQL
CREATE MODEL #{dataset.dataset_id}.#{model_id}
OPTIONS (
model_type='linear_reg',
max_iteration=1,
learn_rate=0.4,
learn_rate_strategy='constant'
) AS (
SELECT 'a' AS f1, 2.0 AS label
UNION ALL
SELECT 'b' AS f1, 3.8 AS label
)
MODEL_SQL
end

before do
dataset_2
@@ -231,6 +247,47 @@
end
end

it "extracts a model to a GCS url with extract_job" do
model = nil
begin
query_job = dataset.query_job model_sql
query_job.wait_until_done!
_(query_job).wont_be :failed?

model = dataset.model model_id
_(model).must_be_kind_of Google::Cloud::Bigquery::Model

Tempfile.open "temp_extract_model" do |tmp|
extract_url = "gs://#{bucket.name}/#{model_id}"

# sut
extract_job = bigquery.extract_job model, extract_url

extract_job.wait_until_done!
_(extract_job).wont_be :failed?
_(extract_job.ml_tf_saved_model?).must_equal true
_(extract_job.ml_xgboost_booster?).must_equal false
_(extract_job.model?).must_equal true
_(extract_job.table?).must_equal false

source = extract_job.source
_(source).must_be_kind_of Google::Cloud::Bigquery::Model
_(source.model_id).must_equal model_id

extract_files = bucket.files prefix: model_id
_(extract_files).wont_be :nil?
_(extract_files).wont_be :empty?
extract_file = extract_files.find { |f| f.name == "#{model_id}/saved_model.pb" }
_(extract_file).wont_be :nil?
downloaded_file = extract_file.download tmp.path
_(downloaded_file.size).must_be :>, 0
end
ensure
# cleanup
model.delete if model
end
end

it "copies a readonly table to another table with copy" do
result = bigquery.copy samples_public_table, "#{dataset_id}.shakespeare_copy", create: :needed, write: :empty do |j|
j.location = "US"
78 changes: 75 additions & 3 deletions google-cloud-bigquery/acceptance/bigquery/model_test.rb
@@ -41,9 +41,9 @@
end

it "can create, list, read, update, and delete a model" do
job = dataset.query_job model_sql
job.wait_until_done!
_(job).wont_be :failed?
query_job = dataset.query_job model_sql
query_job.wait_until_done!
_(query_job).wont_be :failed?

# can find the model in the list of models
_(dataset.models.all.map(&:model_id)).must_include model_id
@@ -64,4 +64,76 @@

_(dataset.model(model_id)).must_be_nil
end

it "extracts itself to a GCS url with extract" do
model = nil
begin
query_job = dataset.query_job model_sql
query_job.wait_until_done!
_(query_job).wont_be :failed?

model = dataset.model model_id
_(model).must_be_kind_of Google::Cloud::Bigquery::Model

Tempfile.open "temp_extract_model" do |tmp|
extract_url = "gs://#{bucket.name}/#{model_id}"

# sut
result = model.extract extract_url
_(result).must_equal true

extract_files = bucket.files prefix: model_id
_(extract_files).wont_be :nil?
_(extract_files).wont_be :empty?
extract_file = extract_files.find { |f| f.name == "#{model_id}/saved_model.pb" }
_(extract_file).wont_be :nil?
downloaded_file = extract_file.download tmp.path
_(downloaded_file.size).must_be :>, 0
end
ensure
# cleanup
model.delete if model
end
end

it "extracts itself to a GCS url with extract_job" do
model = nil
begin
query_job = dataset.query_job model_sql
query_job.wait_until_done!
_(query_job).wont_be :failed?

model = dataset.model model_id
_(model).must_be_kind_of Google::Cloud::Bigquery::Model

Tempfile.open "temp_extract_model" do |tmp|
extract_url = "gs://#{bucket.name}/#{model_id}"

# sut
extract_job = model.extract_job extract_url

extract_job.wait_until_done!
_(extract_job).wont_be :failed?
_(extract_job.ml_tf_saved_model?).must_equal true
_(extract_job.ml_xgboost_booster?).must_equal false
_(extract_job.model?).must_equal true
_(extract_job.table?).must_equal false

source = extract_job.source
_(source).must_be_kind_of Google::Cloud::Bigquery::Model
_(source.model_id).must_equal model_id

extract_files = bucket.files prefix: model_id
_(extract_files).wont_be :nil?
_(extract_files).wont_be :empty?
extract_file = extract_files.find { |f| f.name == "#{model_id}/saved_model.pb" }
_(extract_file).wont_be :nil?
downloaded_file = extract_file.download tmp.path
_(downloaded_file.size).must_be :>, 0
end
ensure
# cleanup
model.delete if model
end
end
end
4 changes: 3 additions & 1 deletion google-cloud-bigquery/lib/google/cloud/bigquery/convert.rb
@@ -318,7 +318,9 @@ def self.source_format format
"parquet" => "PARQUET",
"datastore" => "DATASTORE_BACKUP",
"backup" => "DATASTORE_BACKUP",
"datastore_backup" => "DATASTORE_BACKUP"
"datastore_backup" => "DATASTORE_BACKUP",
"ml_tf_saved_model" => "ML_TF_SAVED_MODEL",
"ml_xgboost_booster" => "ML_XGBOOST_BOOSTER"
}[format.to_s.downcase]
return val unless val.nil?
format
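For reference, the two new entries mean the internal Convert.source_format helper now resolves the model export formats accepted by the Models API; a quick sketch of the lookup, using only the spellings added above:

Google::Cloud::Bigquery::Convert.source_format :ml_tf_saved_model   # => "ML_TF_SAVED_MODEL"
Google::Cloud::Bigquery::Convert.source_format "ml_xgboost_booster" # => "ML_XGBOOST_BOOSTER"

Presumably this is what lets a format option such as format: :ml_xgboost_booster flow through to a model extract job, while the TF SavedModel default needs no explicit format.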
21 changes: 15 additions & 6 deletions google-cloud-bigquery/lib/google/cloud/bigquery/copy_job.rb
@@ -272,12 +272,21 @@ def encryption= val
# Sets the labels to use for the job.
#
# @param [Hash] value A hash of user-provided labels associated with
# the job. You can use these to organize and group your jobs. Label
# keys and values can be no longer than 63 characters, can only
# contain lowercase letters, numeric characters, underscores and
# dashes. International characters are allowed. Label values are
# optional. Label keys must start with a letter and each label in
# the list must have a different key.
# the job. You can use these to organize and group your jobs.
#
# The labels applied to a resource must meet the following requirements:
#
# * Each resource can have multiple labels, up to a maximum of 64.
# * Each label must be a key-value pair.
# * Keys have a minimum length of 1 character and a maximum length of
# 63 characters, and cannot be empty. Values can be empty, and have
# a maximum length of 63 characters.
# * Keys and values can contain only lowercase letters, numeric characters,
# underscores, and dashes. All characters must use UTF-8 encoding, and
# international characters are allowed.
# * The key portion of a label must be unique. However, you can use the
# same key with multiple resources.
# * Keys must start with a lowercase letter or international character.
#
# @!group Attributes
def labels= value
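The expanded label requirements above apply to the hash assigned through the copy job updater. A brief sketch in the style of the acceptance test earlier in this commit, assuming an existing source_table in my_dataset (the table names and label values are illustrative only):

require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
dataset  = bigquery.dataset "my_dataset"   # hypothetical
source   = dataset.table "source_table"    # hypothetical

copy_job = bigquery.copy_job source, "my_dataset.target_table", create: :needed do |job|
  # Keys and values must satisfy the requirements documented above.
  job.labels = { "env" => "staging", "team" => "data-eng" }
end
copy_job.wait_until_done!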
63 changes: 43 additions & 20 deletions google-cloud-bigquery/lib/google/cloud/bigquery/dataset.rb
@@ -313,12 +313,19 @@ def labels
# @param [Hash<String, String>] labels A hash containing key/value
# pairs.
#
# * Label keys and values can be no longer than 63 characters.
# * Label keys and values can contain only lowercase letters, numbers,
# underscores, hyphens, and international characters.
# * Label keys and values cannot exceed 128 bytes in size.
# * Label keys must begin with a letter.
# * Label keys must be unique within a dataset.
# The labels applied to a resource must meet the following requirements:
#
# * Each resource can have multiple labels, up to a maximum of 64.
# * Each label must be a key-value pair.
# * Keys have a minimum length of 1 character and a maximum length of
# 63 characters, and cannot be empty. Values can be empty, and have
# a maximum length of 63 characters.
# * Keys and values can contain only lowercase letters, numeric characters,
# underscores, and dashes. All characters must use UTF-8 encoding, and
# international characters are allowed.
# * The key portion of a label must be unique. However, you can use the
# same key with multiple resources.
# * Keys must start with a lowercase letter or international character.
#
# @example
# require "google/cloud/bigquery"
@@ -1171,13 +1178,21 @@ def routines token: nil, max: nil, filter: nil
# is 1,024 characters. If `job_id` is provided, then `prefix` will not
# be used.
# @param [Hash] labels A hash of user-provided labels associated with
# the job. You can use these to organize and group your jobs. Label
# keys and values can be no longer than 63 characters, can only
# contain lowercase letters, numeric characters, underscores and
# dashes. International characters are allowed. Label values are
# optional. Label keys must start with a letter and each label in the
# list must have a different key. See [Requirements for
# labels](https://cloud.google.com/bigquery/docs/creating-managing-labels#requirements).
# the job. You can use these to organize and group your jobs.
#
# The labels applied to a resource must meet the following requirements:
#
# * Each resource can have multiple labels, up to a maximum of 64.
# * Each label must be a key-value pair.
# * Keys have a minimum length of 1 character and a maximum length of
# 63 characters, and cannot be empty. Values can be empty, and have
# a maximum length of 63 characters.
# * Keys and values can contain only lowercase letters, numeric characters,
# underscores, and dashes. All characters must use UTF-8 encoding, and
# international characters are allowed.
# * The key portion of a label must be unique. However, you can use the
# same key with multiple resources.
# * Keys must start with a lowercase letter or international character.
# @param [Array<String>, String] udfs User-defined function resources
# used in a legacy SQL query. May be either a code resource to load from
# a Google Cloud Storage URI (`gs://bucket/path`), or an inline resource
@@ -1792,13 +1807,21 @@ def external url, format: nil
# is 1,024 characters. If `job_id` is provided, then `prefix` will not
# be used.
# @param [Hash] labels A hash of user-provided labels associated with
# the job. You can use these to organize and group your jobs. Label
# keys and values can be no longer than 63 characters, can only
# contain lowercase letters, numeric characters, underscores and
# dashes. International characters are allowed. Label values are
# optional. Label keys must start with a letter and each label in the
# list must have a different key. See [Requirements for
# labels](https://cloud.google.com/bigquery/docs/creating-managing-labels#requirements).
# the job. You can use these to organize and group your jobs.
#
# The labels applied to a resource must meet the following requirements:
#
# * Each resource can have multiple labels, up to a maximum of 64.
# * Each label must be a key-value pair.
# * Keys have a minimum length of 1 character and a maximum length of
# 63 characters, and cannot be empty. Values can be empty, and have
# a maximum length of 63 characters.
# * Keys and values can contain only lowercase letters, numeric characters,
# underscores, and dashes. All characters must use UTF-8 encoding, and
# international characters are allowed.
# * The key portion of a label must be unique. However, you can use the
# same key with multiple resources.
# * Keys must start with a lowercase letter or international character.
# @yield [updater] A block for setting the schema and other
# options for the destination table. The schema can be omitted if the
# destination table already exists, or if you're loading data from a
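The labels parameter documented in the hunks above is supplied when the job is created, for example on Dataset#query_job; a short sketch (the dataset name and label values are illustrative):

require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
dataset  = bigquery.dataset "my_dataset"   # hypothetical

query_job = dataset.query_job "SELECT 1 AS x",
                              labels: { "owner" => "reporting", "cost-center" => "ml" }
query_job.wait_until_done!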
