Skip to content

Commit

Permalink
Update python API
Browse files Browse the repository at this point in the history
  • Loading branch information
mn-mikke committed Apr 6, 2022
1 parent dce38fe commit 89d3df6
Show file tree
Hide file tree
Showing 15 changed files with 135 additions and 38 deletions.
Expand Up @@ -22,7 +22,13 @@ import ai.h2o.sparkling.api.generation.common.{EntitySubstitutionContext, ModelM
object MetricsInitTemplate extends ((Seq[ModelMetricsSubstitutionContext]) => String) with PythonEntityTemplate {

def apply(metricSubstitutionContexts: Seq[ModelMetricsSubstitutionContext]): String = {
val metricClasses = metricSubstitutionContexts.map(_.entityName)
val metricClasses = metricSubstitutionContexts.map { metricSubstitutionContext =>
if (metricSubstitutionContext.entityName.endsWith("Base")) {
metricSubstitutionContext.entityName.substring(0, metricSubstitutionContext.entityName.length - 4)
} else {
metricSubstitutionContext.entityName
}
}
val imports = metricClasses.map(metricClass => s"ai.h2o.sparkling.ml.metrics.$metricClass.$metricClass")

val entitySubstitutionContext = EntitySubstitutionContext(null, null, null, imports)
Expand Down
1 change: 1 addition & 0 deletions py-scoring/src/ai/h2o/sparkling/ml/__init__.py
Expand Up @@ -20,3 +20,4 @@
from ai.h2o.sparkling.ml.models import H2ODeepLearningMOJOModel, H2ODRFMOJOModel, H2OIsolationForestMOJOModel, H2OPCAMOJOModel, H2OGLRMMOJOModel
from ai.h2o.sparkling.ml.models import H2OMOJOModel, H2OAlgorithmMOJOModel, H2OFeatureMOJOModel, H2OMOJOPipelineModel, H2OMOJOSettings
from ai.h2o.sparkling.ml.models import H2OCoxPHMOJOModel, H2ORuleFitMOJOModel, H2OWord2VecMOJOModel
from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics
26 changes: 20 additions & 6 deletions py-scoring/src/ai/h2o/sparkling/ml/metrics/H2OBinomialMetrics.py
Expand Up @@ -30,12 +30,26 @@ def calculate(dataFrame,
labelCol = "label",
weightCol = None,
offsetCol = None):
'''
The method calculates binomial metrics on a provided data frame with predictions and actual values.
:param dataFrame: A data frame with predictions and actual values
:param domain: A list of classes representing the negative and positive response. The negative class must be at position 0
and the positive class at position 1.
:param predictionCol: The name of the prediction column. The prediction column must have the same type as
a detailed_prediction column coming from the transform method of an H2OMOJOModel descendant, or be an array or
vector of doubles. The first item must be 0.0 or 1.0, representing the negative or positive response. The other items
must be the probabilities predicted for the individual response classes.
:param labelCol: The name of label column that contains actual values.
:param weightCol: The name of a weight column.
:param offsetCol: The name of an offset column.
:return: Calculated binomial metrics
'''
# We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths
Initializer.load_sparkling_jar()
javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calculate(dataFrame,
domain,
predictionCol,
labelCol,
weightCol,
offsetCol)
javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OBinomialMetrics.calculateInternal(dataFrame._jdf,
domain,
predictionCol,
labelCol,
weightCol,
offsetCol)
return H2OBinomialMetrics(javaMetrics)
Expand Up @@ -30,12 +30,31 @@ def calculate(dataFrame,
labelCol = "label",
weightCol = None,
aucType = "AUTO"):
'''
The method calculates multinomial metrics on a provided data frame with predictions and actual values.
:param dataFrame: A data frame with predictions and actual values.
:param domain: List of response classes.
:param predictionCol: The name of the prediction column. The prediction column must have the same type as
a detailed_prediction column coming from the transform method of an H2OMOJOModel descendant, or be an array or
vector of doubles. The first item must be 0.0, 1.0, 2.0, ... representing the index of a response class. The other
items must be the probabilities predicted for the individual response classes.
:param labelCol: The name of label column that contains actual values.
:param weightCol: The name of a weight column.
:param aucType: Type of multinomial AUC/AUCPR calculation. Possible values:
- AUTO,
- NONE,
- MACRO_OVR,
- WEIGHTED_OVR,
- MACRO_OVO,
- WEIGHTED_OVO
:return: Calculated multinomial metrics
'''
# We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths
Initializer.load_sparkling_jar()
javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics.calculate(dataFrame,
domain,
predictionCol,
labelCol,
weightCol,
aucType)
javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2OMultinomialMetrics.calculateInternal(dataFrame._jdf,
domain,
predictionCol,
labelCol,
weightCol,
aucType)
return H2OMultinomialMetrics(javaMetrics)
23 changes: 16 additions & 7 deletions py-scoring/src/ai/h2o/sparkling/ml/metrics/H2ORegressionMetrics.py
Expand Up @@ -25,17 +25,26 @@ class H2ORegressionMetrics(H2ORegressionMetricsBase):

@staticmethod
def calculate(dataFrame,
domain,
predictionCol = "detailed_prediction",
labelCol = "label",
weightCol = None,
offsetCol = None):
'''
The method calculates regression metrics on a provided data frame with predictions and actual values.
:param dataFrame: A data frame with predictions and actual values
:param predictionCol: The name of prediction column. The prediction column must have the same type as
a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or
it must be of DoubleType or FloatType.
:param labelCol: The name of label column that contains actual values.
:param weightCol: The name of a weight column.
:param offsetCol: The name of an offset column.
:return: Calculated regression metrics
'''
# We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths
Initializer.load_sparkling_jar()
javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics.calculate(dataFrame,
domain,
predictionCol,
labelCol,
weightCol,
offsetCol)
javaMetrics = _jvm().ai.h2o.sparkling.ml.metrics.H2ORegressionMetrics.calculateInternal(dataFrame._jdf,
predictionCol,
labelCol,
weightCol,
offsetCol)
return H2ORegressionMetrics(javaMetrics)
4 changes: 3 additions & 1 deletion py-scoring/src/pysparkling/ml/__init__.py
Expand Up @@ -16,12 +16,14 @@
#

from pysparkling.ml.models import *
from pysparkling.ml.metrics import *

__all__ = ["H2OMOJOModel", "H2OSupervisedMOJOModel", "H2OTreeBasedSupervisedMOJOModel", "H2OUnsupervisedMOJOModel",
"H2OTreeBasedUnsupervisedMOJOModel", "H2OMOJOPipelineModel", "H2OMOJOSettings", "H2OBinaryModel",
"H2OKMeansMOJOModel", "H2OGLMMOJOModel", "H2OGAMMOJOModel", "H2OGBMMOJOModel", "H2OXGBoostMOJOModel",
"H2ODeepLearningMOJOModel", "H2ODRFMOJOModel", "H2OIsolationForestMOJOModel", "H2OPCAMOJOModel",
"H2OGLRMMOJOModel", "H2OCoxPHMOJOModel", "H2ORuleFitMOJOModel", "H2OWord2VecMOJOModel"]
"H2OGLRMMOJOModel", "H2OCoxPHMOJOModel", "H2ORuleFitMOJOModel", "H2OWord2VecMOJOModel",
"H2ORegressionMetrics", "H2OMultinomialMetrics", "H2OBinomialMetrics"]

from pysparkling.initializer import Initializer

Expand Down
20 changes: 20 additions & 0 deletions py-scoring/src/pysparkling/ml/metrics/__init__.py
@@ -0,0 +1,20 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OMultinomialMetrics, H2OBinomialMetrics

__all__ = ["H2ORegressionMetrics", "H2OMultinomialMetrics", "H2OBinomialMetrics"]
1 change: 1 addition & 0 deletions py/src/ai/h2o/sparkling/ml/__init__.py
Expand Up @@ -27,3 +27,4 @@
from ai.h2o.sparkling.ml.models import H2ODeepLearningMOJOModel, H2OWord2VecMOJOModel, H2OAutoEncoderMOJOModel, H2ODRFMOJOModel, H2OPCAMOJOModel, H2OGLRMMOJOModel
from ai.h2o.sparkling.ml.models import H2OIsolationForestMOJOModel, H2OCoxPHMOJOModel, H2ORuleFitMOJOModel, H2OStackedEnsembleMOJOModel
from ai.h2o.sparkling.ml.models import H2OMOJOModel, H2OAlgorithmMOJOModel, H2OFeatureMOJOModel, H2OMOJOPipelineModel, H2OMOJOSettings
from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics
3 changes: 2 additions & 1 deletion py/src/pysparkling/ml/__init__.py
Expand Up @@ -19,6 +19,7 @@
from pysparkling.ml.algos.regression import *
from pysparkling.ml.features import *
from pysparkling.ml.models import *
from pysparkling.ml.metrics import *

__all__ = ["ColumnPruner", "H2OGBM", "H2ODeepLearning", "H2OAutoML", "H2OXGBoost", "H2OGLM", "H2OCoxPH", "H2OGAM",
"H2OMOJOModel", "H2OAlgorithmMOJOModel", "H2OFeatureMOJOModel", "H2OSupervisedMOJOModel",
Expand All @@ -32,7 +33,7 @@
"H2ODRFMOJOModel", "H2OIsolationForestMOJOModel", "H2OWord2Vec", "H2OWord2VecMOJOModel", "H2OAutoEncoder",
"H2OAutoEncoderMOJOModel", "H2OPCA", "H2OPCAMOJOModel", "H2OGLRM", "H2OGLRMMOJOModel", "H2ORuleFit",
"H2ORuleFitClassifier", "H2ORuleFitRegressor", "H2ORuleFitMOJOModel", "H2OStackedEnsemble",
"H2OStackedEnsembleMOJOModel"]
"H2OStackedEnsembleMOJOModel", "H2ORegressionMetrics", "H2OBinomialMetrics", "H2OMultinomialMetrics"]

from pysparkling.initializer import Initializer

Expand Down
21 changes: 21 additions & 0 deletions py/src/pysparkling/ml/metrics/__init__.py
@@ -0,0 +1,21 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from ai.h2o.sparkling.ml.metrics import H2ORegressionMetrics, H2OBinomialMetrics, H2OMultinomialMetrics


__all__ = ["H2ORegressionMetrics", "H2OBinomialMetrics", "H2OMultinomialMetrics"]
5 changes: 5 additions & 0 deletions py/tests/unit/with_runtime_sparkling/conftest.py
Expand Up @@ -60,6 +60,11 @@ def irisDatasetPath():
return "file://" + os.path.abspath("../examples/smalldata/iris/iris_wheader.csv")


@pytest.fixture(scope="module")
def irisDataset(spark, irisDatasetPath):
    """Module-scoped fixture that loads the iris CSV through ``spark.read.csv``.

    :param spark: The ``spark`` fixture used to read the file.
    :param irisDatasetPath: Location of the iris CSV, supplied by the ``irisDatasetPath`` fixture.
    :return: The result of reading the CSV with ``header=True`` and ``inferSchema=True``.
    """
    reader = spark.read
    return reader.csv(irisDatasetPath, header=True, inferSchema=True)


@pytest.fixture(scope="module")
def airlinesDatasetPath():
    """Module-scoped fixture returning a ``file://`` location of the airlines sample CSV.

    The relative path is made absolute with ``os.path.abspath``, so the result depends on the
    working directory the tests are started from.
    """
    absolutePath = os.path.abspath("../examples/smalldata/airlines/allyears2k_headers.csv")
    return "file://" + absolutePath
Expand Down
Expand Up @@ -17,24 +17,19 @@
import os
from pysparkling.ml import *

from ai.h2o.sparkling.ml.models.H2OBinomialMetrics import H2OBinomialMetrics
from ai.h2o.sparkling.ml.models.H2OMultinomialMetrics import H2OMultinomialMetrics
from ai.h2o.sparkling.ml.models.H2ORegressionMetrics import H2ORegressionMetrics
from ai.h2o.sparkling.ml.models.H2OMOJOModel import H2OMOJOModel


def testRegressionMetricsCalculation(prostateDataset):
mojo = H2OMOJOModel.createFromMojo(
"file://" + os.path.abspath("../ml/src/test/resources/regre_model_prostate.mojo"))
metrics = H2ORegressionMetrics.calculate(mojo.transform(prostateDataset), labelCol = "capsule")
metrics = H2ORegressionMetrics.calculate(mojo.transform(prostateDataset), labelCol = "CAPSULE")
assert metrics is not None


def testBinomialMetricsCalculation(prostateDataset):
mojo = H2OMOJOModel.createFromMojo(
"file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
domain = mojo.getDomainValues()["capsule"]
metrics = H2OBinomialMetrics.calculate(mojo.transform(prostateDataset), domain, labelCol = "capsule")
metrics = H2OBinomialMetrics.calculate(mojo.transform(prostateDataset), domain, labelCol = "CAPSULE")
assert metrics is not None


Expand Down
Expand Up @@ -68,14 +68,15 @@ object H2OBinomialMetrics extends MetricCalculation {
result
}

def calculate(
// This method is intended to be called from the Python/R API.
def calculateInternal(
dataFrame: DataFrame,
domain: Array[String],
domain: java.util.ArrayList[String],
predictionCol: String,
labelCol: String,
weightCol: String,
offsetCol: String): Unit = {
calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), Option(offsetCol))
calculate(dataFrame, domain.toArray[String](new Array[String](0)), predictionCol, labelCol, Option(weightCol), Option(offsetCol))
}

override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = {
Expand Down
Expand Up @@ -22,7 +22,7 @@ import hex.MultinomialAucType
import org.apache.spark.{ExposeUtils, ml, mllib}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.types.{ArrayType, DataType, DoubleType, FloatType, StringType, StructType}

@MetricsDescription(
Expand All @@ -38,7 +38,7 @@ object H2OMultinomialMetrics extends MetricCalculation {
/**
* The method calculates multinomial metrics on a provided data frame with predictions and actual values.
*
* @param dataFrame A data frame with predictions and actual values
* @param dataFrame A data frame with predictions and actual values.
* @param domain Array of response classes.
* @param predictionCol The name of prediction column. The prediction column must have the same type as
* a detailed_prediction column coming from the transform method of H2OMOJOModel descendant or
Expand Down Expand Up @@ -77,14 +77,15 @@ object H2OMultinomialMetrics extends MetricCalculation {
result
}

def calculate(
// This method is intended to be called from the Python/R API.
def calculateInternal(
dataFrame: DataFrame,
domain: Array[String],
domain: java.util.ArrayList[String],
predictionCol: String,
labelCol: String,
weightCol: String,
aucType: String): H2OMultinomialMetrics = {
calculate(dataFrame, domain, predictionCol, labelCol, Option(weightCol), aucType)
calculate(dataFrame, domain.toArray[String](new Array[String](0)), predictionCol, labelCol, Option(weightCol), aucType)
}

override protected def getPredictionValues(dataType: DataType, domain: Array[String], row: Row): Array[Double] = {
Expand Down
Expand Up @@ -63,7 +63,8 @@ object H2ORegressionMetrics extends MetricCalculation {
result
}

def calculate(
// This method is intended to be called from the Python/R API.
def calculateInternal(
dataFrame: DataFrame,
predictionCol: String,
labelCol: String,
Expand Down

0 comments on commit 89d3df6

Please sign in to comment.