feat(bigquery): Add support for ML model export
* Add model support to Project#extract and #extract_job
* Add ExtractJob#model?
* Add ExtractJob#ml_tf_saved_model?
* Add ExtractJob#ml_xgboost_booster?
* Add Model#extract and #extract_job

closes: #7061
pr: #7451
quartzmo committed Sep 1, 2020
1 parent 7a834b0 commit 7626b17
Showing 20 changed files with 1,238 additions and 215 deletions.
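Taken together, the new surface lets a trained BigQuery ML model be exported to Cloud Storage either synchronously or through a job. A minimal sketch of the intended usage, assuming a dataset my_dataset that already contains a trained model my_model and a bucket my-bucket (all hypothetical names):

require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
dataset  = bigquery.dataset "my_dataset"   # hypothetical dataset
model    = dataset.model "my_model"        # hypothetical trained BigQuery ML model

# Blocking convenience method added on Model; returns true on success.
model.extract "gs://my-bucket/my_model"

# Asynchronous form; returns an ExtractJob exposing the new predicates.
extract_job = model.extract_job "gs://my-bucket/my_model"
extract_job.wait_until_done!
extract_job.model?              # => true
extract_job.ml_tf_saved_model?  # => true  (TensorFlow SavedModel is the default export format)
extract_job.ml_xgboost_booster? # => false

The same model argument can also be passed to Project#extract and Project#extract_job, as the acceptance tests below exercise.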
57 changes: 57 additions & 0 deletions google-cloud-bigquery/acceptance/bigquery/bigquery_test.rb
@@ -54,6 +54,22 @@
t
end
let(:dataset_with_access_id) { "#{prefix}_dataset_with_access" }
let(:model_id) { "model_#{SecureRandom.hex(4)}" }
let :model_sql do
model_sql = <<~MODEL_SQL
CREATE MODEL #{dataset.dataset_id}.#{model_id}
OPTIONS (
model_type='linear_reg',
max_iteration=1,
learn_rate=0.4,
learn_rate_strategy='constant'
) AS (
SELECT 'a' AS f1, 2.0 AS label
UNION ALL
SELECT 'b' AS f1, 3.8 AS label
)
MODEL_SQL
end

before do
dataset_2
@@ -231,6 +247,47 @@
end
end

it "extracts a model to a GCS url with extract_job" do
model = nil
begin
query_job = dataset.query_job model_sql
query_job.wait_until_done!
_(query_job).wont_be :failed?

model = dataset.model model_id
_(model).must_be_kind_of Google::Cloud::Bigquery::Model

Tempfile.open "temp_extract_model" do |tmp|
extract_url = "gs://#{bucket.name}/#{model_id}"

# sut
extract_job = bigquery.extract_job model, extract_url

extract_job.wait_until_done!
_(extract_job).wont_be :failed?
_(extract_job.ml_tf_saved_model?).must_equal true
_(extract_job.ml_xgboost_booster?).must_equal false
_(extract_job.model?).must_equal true
_(extract_job.table?).must_equal false

source = extract_job.source
_(source).must_be_kind_of Google::Cloud::Bigquery::Model
_(source.model_id).must_equal model_id

extract_files = bucket.files prefix: model_id
_(extract_files).wont_be :nil?
_(extract_files).wont_be :empty?
extract_file = extract_files.find { |f| f.name == "#{model_id}/saved_model.pb" }
_(extract_file).wont_be :nil?
downloaded_file = extract_file.download tmp.path
_(downloaded_file.size).must_be :>, 0
end
ensure
# cleanup
model.delete if model
end
end

it "copies a readonly table to another table with copy" do
result = bigquery.copy samples_public_table, "#{dataset_id}.shakespeare_copy", create: :needed, write: :empty do |j|
j.location = "US"
78 changes: 75 additions & 3 deletions google-cloud-bigquery/acceptance/bigquery/model_test.rb
@@ -41,9 +41,9 @@
end

it "can create, list, read, update, and delete a model" do
job = dataset.query_job model_sql
job.wait_until_done!
_(job).wont_be :failed?
query_job = dataset.query_job model_sql
query_job.wait_until_done!
_(query_job).wont_be :failed?

# can find the model in the list of models
_(dataset.models.all.map(&:model_id)).must_include model_id
@@ -64,4 +64,76 @@

_(dataset.model(model_id)).must_be_nil
end

it "extracts itself to a GCS url with extract" do
model = nil
begin
query_job = dataset.query_job model_sql
query_job.wait_until_done!
_(query_job).wont_be :failed?

model = dataset.model model_id
_(model).must_be_kind_of Google::Cloud::Bigquery::Model

Tempfile.open "temp_extract_model" do |tmp|
extract_url = "gs://#{bucket.name}/#{model_id}"

# sut
result = model.extract extract_url
_(result).must_equal true

extract_files = bucket.files prefix: model_id
_(extract_files).wont_be :nil?
_(extract_files).wont_be :empty?
extract_file = extract_files.find { |f| f.name == "#{model_id}/saved_model.pb" }
_(extract_file).wont_be :nil?
downloaded_file = extract_file.download tmp.path
_(downloaded_file.size).must_be :>, 0
end
ensure
# cleanup
model.delete if model
end
end

it "extracts itself to a GCS url with extract_job" do
model = nil
begin
query_job = dataset.query_job model_sql
query_job.wait_until_done!
_(query_job).wont_be :failed?

model = dataset.model model_id
_(model).must_be_kind_of Google::Cloud::Bigquery::Model

Tempfile.open "temp_extract_model" do |tmp|
extract_url = "gs://#{bucket.name}/#{model_id}"

# sut
extract_job = model.extract_job extract_url

extract_job.wait_until_done!
_(extract_job).wont_be :failed?
_(extract_job.ml_tf_saved_model?).must_equal true
_(extract_job.ml_xgboost_booster?).must_equal false
_(extract_job.model?).must_equal true
_(extract_job.table?).must_equal false

source = extract_job.source
_(source).must_be_kind_of Google::Cloud::Bigquery::Model
_(source.model_id).must_equal model_id

extract_files = bucket.files prefix: model_id
_(extract_files).wont_be :nil?
_(extract_files).wont_be :empty?
extract_file = extract_files.find { |f| f.name == "#{model_id}/saved_model.pb" }
_(extract_file).wont_be :nil?
downloaded_file = extract_file.download tmp.path
_(downloaded_file.size).must_be :>, 0
end
ensure
# cleanup
model.delete if model
end
end
end
4 changes: 3 additions & 1 deletion google-cloud-bigquery/lib/google/cloud/bigquery/convert.rb
@@ -318,7 +318,9 @@ def self.source_format format
"parquet" => "PARQUET",
"datastore" => "DATASTORE_BACKUP",
"backup" => "DATASTORE_BACKUP",
"datastore_backup" => "DATASTORE_BACKUP"
"datastore_backup" => "DATASTORE_BACKUP",
"ml_tf_saved_model" => "ML_TF_SAVED_MODEL",
"ml_xgboost_booster" => "ML_XGBOOST_BOOSTER"
}[format.to_s.downcase]
return val unless val.nil?
format
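For reference, the two new entries mean the internal Convert.source_format helper now resolves the model export formats accepted by the Models API; a quick sketch of the lookup, using only the spellings added above:

Google::Cloud::Bigquery::Convert.source_format :ml_tf_saved_model   # => "ML_TF_SAVED_MODEL"
Google::Cloud::Bigquery::Convert.source_format "ml_xgboost_booster" # => "ML_XGBOOST_BOOSTER"

Presumably this is what lets a format option such as format: :ml_xgboost_booster flow through to a model extract job, while the TF SavedModel default needs no explicit format.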
21 changes: 15 additions & 6 deletions google-cloud-bigquery/lib/google/cloud/bigquery/copy_job.rb
@@ -272,12 +272,21 @@ def encryption= val
# Sets the labels to use for the job.
#
# @param [Hash] value A hash of user-provided labels associated with
# the job. You can use these to organize and group your jobs. Label
# keys and values can be no longer than 63 characters, can only
# contain lowercase letters, numeric characters, underscores and
# dashes. International characters are allowed. Label values are
# optional. Label keys must start with a letter and each label in
# the list must have a different key.
# the job. You can use these to organize and group your jobs.
#
# The labels applied to a resource must meet the following requirements:
#
# * Each resource can have multiple labels, up to a maximum of 64.
# * Each label must be a key-value pair.
# * Keys have a minimum length of 1 character and a maximum length of
# 63 characters, and cannot be empty. Values can be empty, and have
# a maximum length of 63 characters.
# * Keys and values can contain only lowercase letters, numeric characters,
# underscores, and dashes. All characters must use UTF-8 encoding, and
# international characters are allowed.
# * The key portion of a label must be unique. However, you can use the
# same key with multiple resources.
# * Keys must start with a lowercase letter or international character.
#
# @!group Attributes
def labels= value
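The expanded label requirements above apply to the hash assigned through the copy job updater. A brief sketch in the style of the acceptance test earlier in this commit, assuming an existing source_table in my_dataset (the table names and label values are illustrative only):

require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
dataset  = bigquery.dataset "my_dataset"   # hypothetical
source   = dataset.table "source_table"    # hypothetical

copy_job = bigquery.copy_job source, "my_dataset.target_table", create: :needed do |job|
  # Keys and values must satisfy the requirements documented above.
  job.labels = { "env" => "staging", "team" => "data-eng" }
end
copy_job.wait_until_done!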
63 changes: 43 additions & 20 deletions google-cloud-bigquery/lib/google/cloud/bigquery/dataset.rb
@@ -313,12 +313,19 @@ def labels
# @param [Hash<String, String>] labels A hash containing key/value
# pairs.
#
# * Label keys and values can be no longer than 63 characters.
# * Label keys and values can contain only lowercase letters, numbers,
# underscores, hyphens, and international characters.
# * Label keys and values cannot exceed 128 bytes in size.
# * Label keys must begin with a letter.
# * Label keys must be unique within a dataset.
# The labels applied to a resource must meet the following requirements:
#
# * Each resource can have multiple labels, up to a maximum of 64.
# * Each label must be a key-value pair.
# * Keys have a minimum length of 1 character and a maximum length of
# 63 characters, and cannot be empty. Values can be empty, and have
# a maximum length of 63 characters.
# * Keys and values can contain only lowercase letters, numeric characters,
# underscores, and dashes. All characters must use UTF-8 encoding, and
# international characters are allowed.
# * The key portion of a label must be unique. However, you can use the
# same key with multiple resources.
# * Keys must start with a lowercase letter or international character.
#
# @example
# require "google/cloud/bigquery"
@@ -1171,13 +1178,21 @@ def routines token: nil, max: nil, filter: nil
# is 1,024 characters. If `job_id` is provided, then `prefix` will not
# be used.
# @param [Hash] labels A hash of user-provided labels associated with
# the job. You can use these to organize and group your jobs. Label
# keys and values can be no longer than 63 characters, can only
# contain lowercase letters, numeric characters, underscores and
# dashes. International characters are allowed. Label values are
# optional. Label keys must start with a letter and each label in the
# list must have a different key. See [Requirements for
# labels](https://cloud.google.com/bigquery/docs/creating-managing-labels#requirements).
# the job. You can use these to organize and group your jobs.
#
# The labels applied to a resource must meet the following requirements:
#
# * Each resource can have multiple labels, up to a maximum of 64.
# * Each label must be a key-value pair.
# * Keys have a minimum length of 1 character and a maximum length of
# 63 characters, and cannot be empty. Values can be empty, and have
# a maximum length of 63 characters.
# * Keys and values can contain only lowercase letters, numeric characters,
# underscores, and dashes. All characters must use UTF-8 encoding, and
# international characters are allowed.
# * The key portion of a label must be unique. However, you can use the
# same key with multiple resources.
# * Keys must start with a lowercase letter or international character.
# @param [Array<String>, String] udfs User-defined function resources
# used in a legacy SQL query. May be either a code resource to load from
# a Google Cloud Storage URI (`gs://bucket/path`), or an inline resource
@@ -1792,13 +1807,21 @@ def external url, format: nil
# is 1,024 characters. If `job_id` is provided, then `prefix` will not
# be used.
# @param [Hash] labels A hash of user-provided labels associated with
# the job. You can use these to organize and group your jobs. Label
# keys and values can be no longer than 63 characters, can only
# contain lowercase letters, numeric characters, underscores and
# dashes. International characters are allowed. Label values are
# optional. Label keys must start with a letter and each label in the
# list must have a different key. See [Requirements for
# labels](https://cloud.google.com/bigquery/docs/creating-managing-labels#requirements).
# the job. You can use these to organize and group your jobs.
#
# The labels applied to a resource must meet the following requirements:
#
# * Each resource can have multiple labels, up to a maximum of 64.
# * Each label must be a key-value pair.
# * Keys have a minimum length of 1 character and a maximum length of
# 63 characters, and cannot be empty. Values can be empty, and have
# a maximum length of 63 characters.
# * Keys and values can contain only lowercase letters, numeric characters,
# underscores, and dashes. All characters must use UTF-8 encoding, and
# international characters are allowed.
# * The key portion of a label must be unique. However, you can use the
# same key with multiple resources.
# * Keys must start with a lowercase letter or international character.
# @yield [updater] A block for setting the schema and other
# options for the destination table. The schema can be omitted if the
# destination table already exists, or if you're loading data from a
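The labels parameter documented in the hunks above is supplied when the job is created, for example on Dataset#query_job; a short sketch (the dataset name and label values are illustrative):

require "google/cloud/bigquery"

bigquery = Google::Cloud::Bigquery.new
dataset  = bigquery.dataset "my_dataset"   # hypothetical

query_job = dataset.query_job "SELECT 1 AS x",
                              labels: { "owner" => "reporting", "cost-center" => "ml" }
query_job.wait_until_done!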
