Update python API

h2oai · Apr 6, 2022 · 89d3df6 · 89d3df6
1 parent dce38fe
commit 89d3df6
Show file tree

Hide file tree

Showing 15 changed files with 135 additions and 38 deletions.
diff --git a/...eneration/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsInitTemplate.scala b/...eneration/src/main/scala/ai/h2o/sparkling/api/generation/python/MetricsInitTemplate.scala
@@ -22,7 +22,13 @@ import ai.h2o.sparkling.api.generation.common.{EntitySubstitutionContext, ModelM
 object MetricsInitTemplate extends ((Seq[ModelMetricsSubstitutionContext]) => String) with PythonEntityTemplate {
 
   def apply(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = {
-    val metricClasses = metricSubstitutionContexts.map(_.entityName)
+    val metricClasses = metricSubstitutionContexts.map { metricSubstitutionContext =>
+      if (metricSubstitutionContext.entityName.endsWith("Base")) {
+        metricSubstitutionContext.entityName.substring(0, metricSubstitutionContext.entityName.length - 4)
+      } else {
+        metricSubstitutionContext.entityName
+      }
+    }
     val imports = metricClasses.map(metricClass => s"ai.h2o.sparkling.ml.metrics.$metricClass.$metricClass")
 
     val entitySubstitutionContext = EntitySubstitutionContext(null, null, null, imports)

diff --git a/py-scoring/src/ai/h2o/sparkling/ml/__init__.py b/py-scoring/src/ai/h2o/sparkling/ml/__init__.py
@@ -20,3 +20,4 @@
 from ai.h2o.sparkling.ml.models import H2ODeepLearningMOJOModel, H2ODRFMOJOModel, H2OIsolationForestMOJOModel, H2OPCAMOJOModel, H2OGLRMMOJOModel
 from ai.h2o.sparkling.ml.models import H2OMOJOModel, H2OAlgorithmMOJOModel, H2OFeatureMOJOModel, H2OMOJOPipelineModel, H2OMOJOSettings
 from ai.h2o.sparkling.ml.models import H2OCoxPHMOJOModel, H2ORuleFitMOJOModel, H2OWord2VecMOJOModel
+from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics
diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py
@@ -30,12 +30,26 @@ def calculate(dataFrame,
                   labelCol = "label",
                   weightCol = None,
                   offsetCol = None):
+        '''
+        The method calculates binomial metrics on a provided data frame with predictions and actual values.
+        :param dataFrame: A data frame with predictions and actual values
+        :param domain: A list of classes representing negative and positive response. Negative class must be at position 0
+        and positive at 1
+        :param predictionCol: The name of prediction column. The prediction column must have the same type as
+        a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or an array type or
+        vector of doubles. The first item must be 0.0 or 1.0 representing negative or positive response. The other items
+        must be probabilities to predict given probability classes.
+        :param labelCol: The name of label column that contains actual values.
+        :param weightCol: The name of a weight column.
+        :param offsetCol: The name of an offset column.
+        :return: Calculated binomial metrics
+        '''
         # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths
         Initializer.load_sparkling_jar()
-        javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calculate(dataFrame,
-                                                                                     domain,
-                                                                                     predictionCol,
-                                                                                     labelCol,
-                                                                                     weightCol,
-                                                                                     offsetCol)
+        javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calculateInternal(dataFrame._jdf,
+                                                                                              domain,
+                                                                                              predictionCol,
+                                                                                              labelCol,
+                                                                                              weightCol,
+                                                                                              offsetCol)
         return H2OBinomialMetrics(javaMetrics)
diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.py
@@ -30,12 +30,31 @@ def calculate(dataFrame,
                   labelCol = "label",
                   weightCol = None,
                   aucType = "AUTO"):
+        '''
+        The method calculates multinomial metrics on a provided data frame with predictions and actual values.
+        :param dataFrame: A data frame with predictions and actual values.
+        :param domain: List of response classes.
+        :param predictionCol: The name of prediction column. The prediction column must have the same type as
+        a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or an array type or
+        vector of doubles. The first item must be 0.0, 1.0, 2.0, ... representing indexes of response classes. The other
+        items must be probabilities to predict given probability classes.
+        :param labelCol: The name of label column that contains actual values.
+        :param weightCol: The name of a weight column.
+        :param aucType: Type of multinomial AUC/AUCPR calculation. Possible values:
+        - AUTO,
+        - NONE,
+        - MACRO_OVR,
+        - WEIGHTED_OVR,
+        - MACRO_OVO,
+        - WEIGHTED_OVO
+        :return: Calculated multinomial metrics
+        '''
         # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths
         Initializer.load_sparkling_jar()
-        javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics.calculate(dataFrame,
-                                                                                         domain,
-                                                                                         predictionCol,
-                                                                                         labelCol,
-                                                                                         weightCol,
-                                                                                         aucType)
+        javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics.calculateInternal(dataFrame._jdf,
+                                                                                                 domain,
+                                                                                                 predictionCol,
+                                                                                                 labelCol,
+                                                                                                 weightCol,
+                                                                                                 aucType)
         return H2OMultinomialMetrics(javaMetrics)
diff --git a/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py b/py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py
@@ -25,17 +25,26 @@ class H2ORegressionMetrics(H2ORegressionMetricsBase):
 
     @staticmethod
     def calculate(dataFrame,
-                  domain,
                   predictionCol = "detailed_prediction",
                   labelCol = "label",
                   weightCol = None,
                   offsetCol = None):
+        '''
+        The method calculates regression metrics on a provided data frame with predictions and actual values.
+        :param dataFrame: A data frame with predictions and actual values
+        :param predictionCol: The name of prediction column. The prediction column must have the same type as
+        a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or
+        it must be of DoubleType or FloatType.
+        :param labelCol: The name of label column that contains actual values.
+        :param weightCol: The name of a weight column.
+        :param offsetCol: The name of an offset column.
+        :return: Calculated regression metrics
+        '''
         # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths
         Initializer.load_sparkling_jar()
-        javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics.calculate(dataFrame,
-                                                                                        domain,
-                                                                                        predictionCol,
-                                                                                        labelCol,
-                                                                                        weightCol,
-                                                                                        offsetCol)
+        javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics.calculateInternal(dataFrame._jdf,
+                                                                                                predictionCol,
+                                                                                                labelCol,
+                                                                                                weightCol,
+                                                                                                offsetCol)
         return H2ORegressionMetrics(javaMetrics)
diff --git a/py-scoring/src/pysparkling/ml/__init__.py b/py-scoring/src/pysparkling/ml/__init__.py
@@ -16,12 +16,14 @@
 #
 
 from pysparkling.ml.models import *
+from pysparkling.ml.metrics import *
 
 __all__ = ["H2OMOJOModel", "H2OSupervisedMOJOModel", "H2OTreeBasedSupervisedMOJOModel", "H2OUnsupervisedMOJOModel",
            "H2OTreeBasedUnsupervisedMOJOModel", "H2OMOJOPipelineModel", "H2OMOJOSettings", "H2OBinaryModel",
            "H2OKMeansMOJOModel", "H2OGLMMOJOModel", "H2OGAMMOJOModel", "H2OGBMMOJOModel", "H2OXGBoostMOJOModel",
            "H2ODeepLearningMOJOModel", "H2ODRFMOJOModel", "H2OIsolationForestMOJOModel", "H2OPCAMOJOModel",
-           "H2OGLRMMOJOModel", "H2OCoxPHMOJOModel", "H2ORuleFitMOJOModel", "H2OWord2VecMOJOModel"]
+           "H2OGLRMMOJOModel", "H2OCoxPHMOJOModel", "H2ORuleFitMOJOModel", "H2OWord2VecMOJOModel",
+           "H2ORegressionMetrics", "H2OMultinomialMetrics", "H2OBinomialMetrics"]
 
 from pysparkling.initializer import Initializer
 

diff --git a/py-scoring/src/pysparkling/ml/metrics/__init__.py b/py-scoring/src/pysparkling/ml/metrics/__init__.py
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OMultinomialMetrics, H2OBinomialMetrics
+
+__all__ = ["H2ORegressionMetrics", "H2OMultinomialMetrics", "H2OBinomialMetrics"]
diff --git a/py/src/ai/h2o/sparkling/ml/__init__.py b/py/src/ai/h2o/sparkling/ml/__init__.py
@@ -27,3 +27,4 @@
 from ai.h2o.sparkling.ml.models import H2ODeepLearningMOJOModel, H2OWord2VecMOJOModel, H2OAutoEncoderMOJOModel, H2ODRFMOJOModel, H2OPCAMOJOModel, H2OGLRMMOJOModel
 from ai.h2o.sparkling.ml.models import H2OIsolationForestMOJOModel, H2OCoxPHMOJOModel, H2ORuleFitMOJOModel, H2OStackedEnsembleMOJOModel
 from ai.h2o.sparkling.ml.models import H2OMOJOModel, H2OAlgorithmMOJOModel, H2OFeatureMOJOModel, H2OMOJOPipelineModel, H2OMOJOSettings
+from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics
diff --git a/py/src/pysparkling/ml/__init__.py b/py/src/pysparkling/ml/__init__.py
@@ -19,6 +19,7 @@
 from pysparkling.ml.algos.regression import *
 from pysparkling.ml.features import *
 from pysparkling.ml.models import *
+from pysparkling.ml.metrics import *
 
 __all__ = ["ColumnPruner", "H2OGBM", "H2ODeepLearning", "H2OAutoML", "H2OXGBoost", "H2OGLM", "H2OCoxPH", "H2OGAM",
            "H2OMOJOModel", "H2OAlgorithmMOJOModel", "H2OFeatureMOJOModel", "H2OSupervisedMOJOModel",
@@ -32,7 +33,7 @@
            "H2ODRFMOJOModel", "H2OIsolationForestMOJOModel", "H2OWord2Vec", "H2OWord2VecMOJOModel", "H2OAutoEncoder",
            "H2OAutoEncoderMOJOModel", "H2OPCA", "H2OPCAMOJOModel", "H2OGLRM", "H2OGLRMMOJOModel", "H2ORuleFit",
            "H2ORuleFitClassifier", "H2ORuleFitRegressor", "H2ORuleFitMOJOModel", "H2OStackedEnsemble",
-           "H2OStackedEnsembleMOJOModel"]
+           "H2OStackedEnsembleMOJOModel", "H2ORegressionMetrics", "H2OBinomialMetrics", "H2OMultinomialMetrics"]
 
 from pysparkling.initializer import Initializer
 

diff --git a/py/src/pysparkling/ml/metrics/__init__.py b/py/src/pysparkling/ml/metrics/__init__.py
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics
+
+
+__all__ = ["H2ORegressionMetrics", "H2OBinomialMetrics", "H2OMultinomialMetrics"]
diff --git a/py/tests/unit/with_runtime_sparkling/conftest.py b/py/tests/unit/with_runtime_sparkling/conftest.py
@@ -60,6 +60,11 @@ def irisDatasetPath():
     return "file://" + os.path.abspath("../examples/smalldata/iris/iris_wheader.csv")
 
 
+@pytest.fixture(scope="module")
+def irisDataset(spark, irisDatasetPath):
+    return spark.read.csv(irisDatasetPath, header=True, inferSchema=True)
+
+
 @pytest.fixture(scope="module")
 def airlinesDatasetPath():
     return "file://" + os.path.abspath("../examples/smalldata/airlines/allyears2k_headers.csv")

diff --git a/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py b/py/tests/unit/with_runtime_sparkling/test_metric_calculation.py
@@ -17,24 +17,19 @@
 import os
 from pysparkling.ml import *
 
-from ai.h2o.sparkling.ml.models.H2OBinomialMetrics import H2OBinomialMetrics
-from ai.h2o.sparkling.ml.models.H2OMultinomialMetrics import H2OMultinomialMetrics
-from ai.h2o.sparkling.ml.models.H2ORegressionMetrics import H2ORegressionMetrics
-from ai.h2o.sparkling.ml.models.H2OMOJOModel import H2OMOJOModel
-
 
 def testRegressionMetricsCalculation(prostateDataset):
     mojo = H2OMOJOModel.createFromMojo(
         "file://" + os.path.abspath("../ml/src/test/resources/regre_model_prostate.mojo"))
-    metrics = H2ORegressionMetrics.calculate(mojo.transform(prostateDataset), labelCol = "capsule")
+    metrics = H2ORegressionMetrics.calculate(mojo.transform(prostateDataset), labelCol = "CAPSULE")
     assert metrics is not None
 
 
 def testBinomialMetricsCalculation(prostateDataset):
     mojo = H2OMOJOModel.createFromMojo(
         "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
     domain = mojo.getDomainValues()["capsule"]
-    metrics = H2OBinomialMetrics.calculate(mojo.transform(prostateDataset), domain, labelCol = "capsule")
+    metrics = H2OBinomialMetrics.calculate(mojo.transform(prostateDataset), domain, labelCol = "CAPSULE")
     assert metrics is not None
 
 

diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.scala
@@ -68,14 +68,15 @@ object H2OBinomialMetrics extends MetricCalculation {
     result
   }
 
-  def calculate(
+  // Entry point for calls from the Python/R API: takes Java-friendly argument types and returns the calculated metrics.
+  def calculateInternal(
       dataFrame: DataFrame,
-      domain: Array[String],
+      domain: java.util.ArrayList[String],
       predictionCol: String,
       labelCol: String,
       weightCol: String,
-      offsetCol: String): Unit = {
-    calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), Option(offsetCol))
+      offsetCol: String): H2OBinomialMetrics = {
+    calculate(dataFrame, domain.toArray[String](new Array[String](0)), predictionCol, labelCol, Option(weightCol), Option(offsetCol))
   }
 
   override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = {

diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2OMultinomialMetrics.scala
@@ -22,7 +22,7 @@ import hex.MultinomialAucType
 import org.apache.spark.{ExposeUtils, ml, mllib}
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.sql.functions.col
-import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.types.{ArrayType, DataType, DoubleType, FloatType, StringType, StructType}
 
 @MetricsDescription(
@@ -38,7 +38,7 @@ object H2OMultinomialMetrics extends MetricCalculation {
   /**
     * The method calculates multinomial metrics on a provided data frame with predictions and actual values.
     *
-    * @param dataFrame A data frame with predictions and actual values
+    * @param dataFrame A data frame with predictions and actual values.
     * @param domain Array of response classes.
     * @param predictionCol   The name of prediction column. The prediction column must have the same type as
     *                        a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or
@@ -77,14 +77,15 @@ object H2OMultinomialMetrics extends MetricCalculation {
     result
   }
 
-  def calculate(
+  // This method is the entry point for calls from the Python/R API
+  def calculateInternal(
       dataFrame: DataFrame,
-      domain: Array[String],
+      domain: java.util.ArrayList[String],
       predictionCol: String,
       labelCol: String,
       weightCol: String,
       aucType: String): H2OMultinomialMetrics = {
-    calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), aucType)
+    calculate(dataFrame, domain.toArray[String](new Array[String](0)), predictionCol, labelCol, Option(weightCol), aucType)
   }
 
   override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = {

diff --git a/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala b/scoring/src/main/scala/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.scala
@@ -63,7 +63,8 @@ object H2ORegressionMetrics extends MetricCalculation {
     result
   }
 
-  def calculate(
+  // This method is the entry point for calls from the Python/R API
+  def calculateInternal(
       dataFrame: DataFrame,
       predictionCol: String,
       labelCol: String,