h2oai · krasinski · Dec 4, 2023 · Dec 11, 2023 · mn-mikke · Dec 11, 2023
@@ -29,23 +29,26 @@ import hex.schemas.DRFModelV3.DRFModelOutputV3
 import hex.schemas.DeepLearningModelV3.DeepLearningModelOutputV3
 import hex.schemas.DeepLearningV3.{DeepLearningParametersV3 => DLParamsV3}
 import hex.schemas.ExtendedIsolationForestModelV3.ExtendedIsolationForestModelOutputV3
+import hex.schemas.ExtendedIsolationForestV3.{ExtendedIsolationForestParametersV3 => ExtIFParamsV3}
 import hex.schemas.GAMModelV3.GAMModelOutputV3
 import hex.schemas.GBMModelV3.GBMModelOutputV3
 import hex.schemas.GLMModelV3.GLMModelOutputV3
 import hex.schemas.IsolationForestModelV3.IsolationForestModelOutputV3
 import hex.schemas.IsolationForestV3.{IsolationForestParametersV3 => IFParamsV3}
-import hex.schemas.ExtendedIsolationForestV3.{ExtendedIsolationForestParametersV3 => ExtIFParamsV3}
 import hex.schemas.KMeansModelV3.KMeansModelOutputV3
 import hex.schemas.KMeansV3.{KMeansParametersV3 => KMeansParamsV3}
 import hex.schemas.RuleFitModelV3.RuleFitModelOutputV3
 import hex.schemas.RuleFitV3.RuleFitParametersV3
+import hex.schemas.UpliftDRFModelV3.UpliftDRFModelOutputV3
+import hex.schemas.UpliftDRFV3.{UpliftDRFParametersV3 => UpliftParams}
 import hex.schemas.XGBoostModelV3.XGBoostModelOutputV3
 import hex.schemas.XGBoostV3.{XGBoostParametersV3 => XGBParamsV3}
 import hex.schemas.{DRFV3, GAMV3, GBMV3, GLMV3}
 import hex.tree.drf.DRFModel.DRFParameters
 import hex.tree.gbm.GBMModel.GBMParameters
 import hex.tree.isofor.IsolationForestModel.{IsolationForestParameters => IFParameters}
 import hex.tree.isoforextended.ExtendedIsolationForestModel.{ExtendedIsolationForestParameters => ExtIFParams}
+import hex.tree.uplift.UpliftDRFModel.UpliftDRFParameters
 import hex.tree.xgboost.XGBoostModel.XGBoostParameters
 
 import java.util
@@ -97,6 +100,10 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration {
     val gamFields = Seq(ignoredCols, betaConstraints, gamCols)
     val gbmFields = Seq(monotonicity, calibrationDataFrame, ignoredCols)
     val drfFields = Seq(calibrationDataFrame, ignoredCols)
+    val upliftDrfFields = Seq(
+      ExplicitField("treatment_column", "HasTreatmentCol", "treatment"),
+      ExplicitField("response_column", "HasLabelCol", "label"),
+      ignoredCols)
     val kmeansFields = Seq(userPoints, ignoredCols)
     val coxPHFields = Seq(ignoredCols, interactionPairs)
     val ifFields = Seq(ignoredCols, calibrationDataFrame, validationLabelCol)
@@ -118,6 +125,7 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration {
       ("H2OXGBoostParams", classOf[XGBParamsV3], classOf[XGBoostParameters], xgboostFields, noDeprecation),
       ("H2OGBMParams", classOf[GBMV3.GBMParametersV3], classOf[GBMParameters], gbmFields, noDeprecation),
       ("H2ODRFParams", classOf[DRFV3.DRFParametersV3], classOf[DRFParameters], drfFields, noDeprecation),
+      ("H2OUpliftDRFParams", classOf[UpliftParams], classOf[UpliftDRFParameters], upliftDrfFields, noDeprecation),
       ("H2OGLMParams", classOf[GLMV3.GLMParametersV3], classOf[GLMParameters], glmFields, noDeprecation),
       ("H2OGAMParams", classOf[GAMV3.GAMParametersV3], classOf[GAMParameters], gamFields, noDeprecation),
       ("H2ODeepLearningParams", classOf[DLParamsV3], classOf[DeepLearningParameters], dlFields, noDeprecation),
@@ -158,6 +166,12 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration {
       ("H2OXGBoost", classOf[XGBoostParameters], treeSupervised, Seq(withDistribution), None),
       ("H2OGBM", classOf[GBMParameters], treeSupervised, Seq(withDistribution), None),
       ("H2ODRF", classOf[DRFParameters], treeSupervised, Seq(withDistribution), None),
+      (
+        "H2OUpliftDRF",
+        classOf[UpliftDRFParameters],
+        treeUnsupervised,
+        Seq(withDistribution, "H2OUpliftDRFExtras"),
+        None),
       ("H2OGLM", classOf[GLMParameters], cvSupervised, Seq(withFamily), Some("H2OGLMMetrics")),
       ("H2OGAM", classOf[GAMParameters], cvSupervised, Seq(withFamily), None),
       ("H2ODeepLearning", classOf[DeepLearningParameters], cvSupervised, Seq(withDistribution), None),
@@ -202,6 +216,7 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration {
       ("H2OXGBoostModelOutputs", classOf[XGBoostModelOutputV3]),
       ("H2OGBMModelOutputs", classOf[GBMModelOutputV3]),
       ("H2ODRFModelOutputs", classOf[DRFModelOutputV3]),
+      ("H2OUpliftDRFModelOutputs", classOf[UpliftDRFModelOutputV3]),
       ("H2OGLMModelOutputs", classOf[GLMModelOutputV3]),
       ("H2OGAMModelOutputs", classOf[GAMModelOutputV3]),
       ("H2ODeepLearningModelOutputs", classOf[DeepLearningModelOutputV3]),

@@ -19,6 +19,7 @@
 	\begin{itemize}
 		\item DeepLearning
 		\item DRF
+		\item UpliftDRF
 		\item GBM
 		\item XGBoost
 		\item AutoML

@@ -282,6 +282,7 @@ The list of specific MOJO models:
 - ``H2OKMeansMOJOModel``
 - ``H2OIsolationForestMOJOModel``
 - ``H2OExtendedIsolationForestMOJOModel``
+- ``H2OUpliftDRFMOJOModel``
 - ``H2OCoxPHMOJOModel``
 - ``H2OTargetEncoderMOJOModel``
 - ``H2OAutoEncoderMOJOModel``

@@ -15,6 +15,7 @@ Machine Learning
    sw_xgboost
    sw_isolation_forest
    sw_extended_isolation_forest
+   sw_uplift_drf
    sw_coxph
    sw_deep_learning
    sw_rule_fit

@@ -0,0 +1,144 @@
+.. _uplift_drf:
+
+Train Distributed Uplift Random Forest (Uplift DRF) Model in Sparkling Water
+----------------------------------------------------------------------------
+
+Introduction
+~~~~~~~~~~~~
+Distributed Uplift Random Forest (Uplift DRF) is a classification tool for modeling uplift - the incremental impact of a treatment. Only binomial classification (distribution="bernoulli") is currently supported.
+Uplift DRF can be applied in fields where we operate with two groups of subjects. The first group, let’s call it treatment, receives some kind of treatment (e.g. a marketing campaign, medicine, …), and the second group, let’s call it control, is separated from the treatment. We also gather information about their response, i.e. whether they bought a product, recovered from a disease, or similar. Then, Uplift DRF trains so-called uplift trees.
+For a more comprehensive description, see the `H2O-3 Distributed Uplift Random Forest (Uplift DRF) documentation <https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/upliftdrf.html>`__.
+
+Example
+~~~~~~~
+
+The following section describes how to train the Distributed Uplift Random Forest (Uplift DRF) model in Sparkling Water in Scala & Python, following the same example as the H2O-3 documentation mentioned above. See also :ref:`parameters_H2OUpliftDRF`
+and :ref:`model_details_H2OUpliftDRFMOJOModel`.
+
+.. content-tabs::
+
+    .. tab-container:: Scala
+        :title: Scala
+
+        First, let's start Sparkling Shell as
+
+        .. code:: shell
+
+            ./bin/sparkling-shell
+
+        Start H2O cluster inside the Spark environment
+
+        .. code:: scala
+
+            import ai.h2o.sparkling._
+            import java.net.URI
+            val hc = H2OContext.getOrCreate()
+
+        Load the data into a Spark DataFrame and split it into training and testing sets
+
+        .. code:: scala
+
+            import org.apache.spark.SparkFiles
+            val datasetUrl = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv"
+            spark.sparkContext.addFile(datasetUrl) //for example purposes, on a real cluster it's better to load directly from distributed storage
+            val sparkDF = spark.read.option("header", "true").option("inferSchema", "true").csv(SparkFiles.get("criteo_uplift_13k.csv"))
+            val Array(trainingDF, testingDF) = sparkDF.randomSplit(Array(0.8, 0.2))
+
+        Train the model. You can configure all the available Distributed Uplift Random Forest (Uplift DRF) arguments using provided setters.
+
+        .. code:: scala
+
+            import ai.h2o.sparkling.ml.algos.H2OUpliftDRF
+
+            val predictorColumns = Array("f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8")
+            val responseColumn = "conversion"
+            val treatmentColumn = "treatment"
+
+            val algo = new H2OUpliftDRF()
+               .setNtrees(10)
+               .setMaxDepth(5)
+               .setTreatmentCol(treatmentColumn)
+               .setUpliftMetric("KL")
+               .setMinRows(10)
+               .setSeed(1234)
+               .setAuucType("qini")
+               .setLabelCol(responseColumn)
+               .setFeaturesCols(predictorColumns :+ treatmentColumn :+ responseColumn)
+
+            val model = algo.fit(trainingDF)
+
+        Run Predictions
+
+        .. code:: scala
+
+            model.transform(testingDF).show(truncate = false)
+
+        View model summary containing info about trained trees etc.
+
+        .. code:: scala
+
+            model.getModelSummary()
+
+        You can also get other model details by calling methods listed in :ref:`model_details_H2OUpliftDRFMOJOModel`.
+
+
+    .. tab-container:: Python
+        :title: Python
+
+        First, let's start PySparkling Shell as
+
+        .. code:: shell
+
+            ./bin/pysparkling
+
+        Start H2O cluster inside the Spark environment
+
+        .. code:: python
+
+            from pysparkling import *
+            hc = H2OContext.getOrCreate()
+
+        Parse the data using H2O and convert them to Spark Frame
+
+        .. code:: python
+
+            import h2o
+            frame = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv")
+            sparkDF = hc.asSparkFrame(frame)
+            [trainingDF, testingDF] = sparkDF.randomSplit([0.8, 0.2])
+
+        Train the model. You can configure all the available Distributed Uplift Random Forest (Uplift DRF) arguments using provided setters or constructor parameters.
+
+        .. code:: python
+
+            from pysparkling.ml import H2OUpliftDRF
+
+            treatmentColumn = "treatment"
+            responseColumn = "conversion"
+            predictors = ["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", treatmentColumn, responseColumn]
+
+            algo = H2OUpliftDRF(featuresCols=predictors,
+                                ntrees = 10,
+                                maxDepth = 5,
+                                treatmentCol = treatmentColumn,
+                                upliftMetric = "KL",
+                                minRows = 10,
+                                seed = 1234,
+                                auucType = "qini",
+                                labelCol = responseColumn)
+
+            model = algo.fit(trainingDF)
+
+        Run Predictions
+
+        .. code:: python
+
+            model.transform(testingDF).show(truncate = False)
+
+        View model summary containing info about trained trees etc.
+
+        .. code:: python
+
+            model.getModelSummary()
+
+        You can also get other model details by calling methods listed in :ref:`model_details_H2OUpliftDRFMOJOModel`.