Add interpretability example notebooks #21

Open · wants to merge 37 commits into base: obliquepr

Commits (37)
d7b607b
grid search parameters
jshinm May 26, 2022
3d599b0
upload iris notebook
jshinm May 26, 2022
496969c
add 5000 sample
jshinm May 26, 2022
6e6f787
add delta plot
jshinm May 29, 2022
b3d3aa6
add mnist notebook
jshinm May 29, 2022
c5af203
fix delta calculation
jshinm May 30, 2022
6e1187c
preserve comprehensive run
jshinm Jun 8, 2022
8d45399
optimize runtime
jshinm Jun 8, 2022
6006426
fix bug and reparameterize
jshinm Jun 10, 2022
d45d2a8
add roc_auc and confusion matrix
jshinm Jun 10, 2022
6360f3d
add 3d visualization
jshinm Jun 12, 2022
d5b19f2
add narratives and descriptions
jshinm Jun 12, 2022
64169d8
add description
jshinm Jun 13, 2022
ef5d9bc
remove long notebook
jshinm Jun 13, 2022
faa0c53
added oblique trees
jshinm Jun 13, 2022
7a23b9b
remove ovr wrapper
jshinm Jun 13, 2022
83cda7a
add grid search results
jshinm Jun 15, 2022
7030191
remove over feature selection filter for RF
jshinm Jun 15, 2022
01826d6
add plotly io for plot rendering
jshinm Jun 15, 2022
0a016e3
add robustness test
jshinm Jun 16, 2022
b18d356
add description and rerun notebook
jshinm Jun 16, 2022
c56f351
new parameter search
jshinm Jun 16, 2022
22f643d
change plot style
jshinm Jun 16, 2022
01136c5
optimize robustness test with new parameters
jshinm Jun 16, 2022
dccfdd8
change plot style
jshinm Jun 16, 2022
14b172d
run appendix block
jshinm Jun 16, 2022
d3087a5
Merge branch 'neurodata:obliquepr' into obliquepr
jshinm Jun 29, 2022
5a15c53
added score vs performance metrics
jshinm Jun 29, 2022
1b3c991
uploading pickled dataframe
jshinm Jun 29, 2022
30b38d3
added refitting function and plot on score vs performance metrics
jshinm Jun 29, 2022
e2691a9
added simulation run dataframe
jshinm Jun 29, 2022
25a1e98
Added binning figure and plotly figures
jshinm Jun 29, 2022
5e8f1a6
Added binning figure and changed unit of the size to MB
jshinm Jun 29, 2022
b73915e
Add sparse parity example under ensemble section
jshinm Jul 25, 2022
67e04ef
Add cc18 example under ensemble section
jshinm Jul 25, 2022
da00a8b
Use tuned parameters and improve reproducibility
jshinm Aug 19, 2022
e30af84
Use selected datasets from cc18 suite and pre-tuned parameters
jshinm Aug 19, 2022
160 changes: 160 additions & 0 deletions examples/ensemble/plot_oblique_axis_aligned_forests_cc18.py
@@ -0,0 +1,160 @@
"""
===============================================================================
Plot oblique forest and axis-aligned random forest predictions on cc18 datasets
===============================================================================

A performance comparison between oblique forest and standard axis-aligned
random forest using three datasets from the OpenML benchmarking suites.

Two of these datasets, [WDBC](https://www.openml.org/search?type=data&sort=runs&id=1510)
and [Phishing Website](https://www.openml.org/search?type=data&sort=runs&id=4534),
consist of 31 features each, where the former is entirely numeric and the
latter entirely nominal. The third dataset, dubbed
[cnae-9](https://www.openml.org/search?type=data&status=active&id=1468), is a
numeric dataset with a notably large feature space of 857 features. As you
will notice, of these three datasets, the oblique forest outperforms the
axis-aligned random forest on cnae-9 by exploiting its sparse random
projection mechanism. All datasets are subsampled due to computational
constraints.
"""

import numpy as np
import pandas as pd
from datetime import datetime
import openml
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ObliqueRandomForestClassifier
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_validate

random_state = 123456
t0 = datetime.now()
data_ids = [11, 40499] # openml dataset id
df = pd.DataFrame()


def load_cc18(data_id):
dat = openml.datasets.get_dataset(data_id, download_data=False)
d_name = dat.name
d = dat.get_data()[0]

# Subsampling large datasets
n = int(d.shape[0] * 0.1)
d = d.sample(n, random_state=random_state)
X, y = d.iloc[:, :-1], d.iloc[:, -1]

return X, y, d_name


def get_scores(X, y, d_name="UNK", n_cv=5, n_repeats=2, random_state=1, kwargs=None):
clfs = [
RandomForestClassifier(**kwargs[0], random_state=random_state),
ObliqueRandomForestClassifier(**kwargs[1], random_state=random_state),
]

tmp = []

for i, clf in enumerate(clfs):
cv = RepeatedKFold(
n_splits=n_cv, n_repeats=n_repeats, random_state=random_state
)
test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring="accuracy")

tmp.append(
[
d_name,
["RF", "OF"][i],
test_score["test_score"],
test_score["test_score"].mean(),
]
)
print(
f'{d_name} mean test score for {["RF", "OF"][i]}:'
f' {test_score["test_score"].mean()}'
)

df = pd.DataFrame(tmp, columns=["dataset", "model", "score", "mean"])
df = df.explode("score")
df["score"] = df["score"].astype(float)
df.reset_index(inplace=True, drop=True)

return df


def load_best_params(data_ids):
# folder_path = "/home/jshinm/Desktop/workstation/sklearn-jms/notebook/hidden/output/"
folder_path = None
params = []

if not folder_path:
# pre-tuned hyper-parameters
params += [
[
{"max_depth": 5, "max_features": "sqrt", "n_estimators": 100},
{"max_depth": 5, "max_features": None, "n_estimators": 100},
],
[
{"max_depth": 10, "max_features": "log2", "n_estimators": 200},
{"max_depth": 10, "max_features": 80, "n_estimators": 200},
],
]
else:
for data_id in data_ids:
file_path = f"OFvsRF_grid_search_cv_results_openml_{data_id}.csv"
df = pd.read_csv(folder_path + file_path).sort_values(
"mean_test_score", ascending=False
)
tmp = []
for clf in ["RF", "OF"]:
tmp.append(eval(df.query(f'clf=="{clf}"')["params"].iloc[0]))
params.append(tmp)

return params


params = load_best_params(data_ids=data_ids)

for i, data_id in enumerate(data_ids):
X, y, d_name = load_cc18(data_id=data_id)
print(f"Loading [{d_name}] dataset..")
tmp = get_scores(
X=X, y=y, d_name=d_name, random_state=random_state, kwargs=params[i]
)
df = pd.concat([df, tmp])

t_d = (datetime.now() - t0).seconds
print(f"It took {t_d} seconds to run the script")

# Draw a comparison plot
d_names = df.dataset.unique()
N = d_names.shape[0]

fig, ax = plt.subplots(1, N, figsize=(6 * N, 6))

for i, name in enumerate(d_names):
if N == 1:
axs = ax
else:
axs = ax[i]
dff = df.query(f'dataset == "{name}"')

sns.stripplot(data=dff, x="model", y="score", ax=axs, dodge=True)
sns.boxplot(data=dff, x="model", y="score", ax=axs, color="white")
axs.set_title(f"{name} (#{data_ids[i]})")

rf = dff.query('model=="RF"')["mean"].iloc[0]
rff = f"RF (Mean Test Score: {round(rf,3)})"

of = dff.query('model=="OF"')["mean"].iloc[0]
off = f"OF (Mean Test Score: {round(of,3)})"

axs.legend([rff, off], loc=4)

if i != 0:
axs.set_ylabel("")
else:
axs.set_ylabel("Accuracy")

axs.set_xlabel("")

plt.savefig(f"plot_cc18_{t_d}s.jpg")
plt.show()
100 changes: 100 additions & 0 deletions examples/ensemble/plot_oblique_axis_aligned_forests_sparse_parity.py
@@ -0,0 +1,100 @@
"""
==========================================================================================
Plot oblique forest and axis-aligned random forest predictions on sparse parity simulation
==========================================================================================

A performance comparison between oblique forest and standard axis-aligned
random forest using a sparse parity simulation dataset.

Sparse parity is a variation of the noisy parity problem, which itself is a
multivariate generalization of the noisy XOR problem. It is a binary
classification task in high dimensions. The simulation generates `n_samples`
sample points, each with `p` features drawn uniformly from the range -1 to
+1. Only the first `p*` features carry information about the class: the
binary label is 1 if an odd number of those first `p*` features are greater
than 0, and 0 otherwise. The simulation is further detailed in this
[publication](https://epubs.siam.org/doi/epdf/10.1137/1.9781611974973.56).
"""

import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ObliqueRandomForestClassifier
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_validate

random_state = 123456
t0 = datetime.now()


def sparse_parity(n_samples, p=20, p_star=3, random_seed=None, **kwarg):
if random_seed:
np.random.seed(random_seed)

X = np.random.uniform(-1, 1, (n_samples, p))
y = np.zeros(n_samples)

for i in range(0, n_samples):
y[i] = sum(X[i, :p_star] > 0) % 2

return X, y


def get_scores(X, y, n_cv=5, n_repeats=1, random_state=1, kwargs=None):
clfs = [
RandomForestClassifier(**kwargs[0], random_state=random_state),
ObliqueRandomForestClassifier(**kwargs[1], random_state=random_state),
]

tmp = []

for i, clf in enumerate(clfs):
cv = RepeatedKFold(
n_splits=n_cv, n_repeats=n_repeats, random_state=random_state
)
test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring="accuracy")

tmp.append(
[["RF", "OF"][i], test_score["test_score"], test_score["test_score"].mean()]
)

df = pd.DataFrame(tmp, columns=["model", "score", "mean"])
df = df.explode("score")
df["score"] = df["score"].astype(float)
df.reset_index(inplace=True, drop=True)

return df


# Grid searched hyper-parameters
params = [
{"max_features": None, "n_estimators": 100, "max_depth": None},
{"max_features": 40, "n_estimators": 100, "max_depth": 20},
]

X, y = sparse_parity(n_samples=10000, random_seed=random_state)

df = get_scores(X=X, y=y, n_cv=3, n_repeats=1, random_state=random_state, kwargs=params)
t_d = (datetime.now() - t0).seconds
print(f"It took {t_d} seconds to run the script")

# Draw a comparison plot
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

sns.stripplot(data=df, x="model", y="score", ax=ax, dodge=True)
sns.boxplot(data=df, x="model", y="score", ax=ax, color="white")
ax.set_title("Sparse Parity")

rf = df.query('model=="RF"')["mean"].iloc[0]
rff = f"RF (Mean Test Score: {round(rf,3)})"

of = df.query('model=="OF"')["mean"].iloc[0]
off = f"OF (Mean Test Score: {round(of,3)})"

ax.legend([rff, off], loc=4)

plt.savefig(f"plot_sim_{t_d}s.jpg")
plt.show()
100 changes: 56 additions & 44 deletions examples/tree/plot_iris_dtc.py
@@ -3,8 +3,8 @@
Plot the decision surface of decision trees trained on the iris dataset
=======================================================================

Plot the decision surface of a decision tree trained on pairs
of features of the iris dataset.
Plot the decision surface of a decision tree and oblique decision tree
trained on pairs of features of the iris dataset.

See :ref:`decision tree <tree>` for more information on the estimator.

@@ -27,61 +27,73 @@
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier, ObliqueDecisionTreeClassifier
from sklearn.inspection import DecisionBoundaryDisplay


# Parameters
n_classes = 3
plot_colors = "ryb"
plot_step = 0.02


for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
# We only take the two corresponding features
X = iris.data[:, pair]
y = iris.target

# Train
clf = DecisionTreeClassifier().fit(X, y)

# Plot the decision boundary
ax = plt.subplot(2, 3, pairidx + 1)
plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
DecisionBoundaryDisplay.from_estimator(
clf,
X,
cmap=plt.cm.RdYlBu,
response_method="predict",
ax=ax,
xlabel=iris.feature_names[pair[0]],
ylabel=iris.feature_names[pair[1]],
)

# Plot the training points
for i, color in zip(range(n_classes), plot_colors):
idx = np.where(y == i)
plt.scatter(
X[idx, 0],
X[idx, 1],
c=color,
label=iris.target_names[i],
clf_labels = ["Axis-aligned", "Oblique"]
random_state = 123456

clfs = [
DecisionTreeClassifier(random_state=random_state),
ObliqueDecisionTreeClassifier(random_state=random_state)
]

for clf, clf_lab in zip(clfs, clf_labels):

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
# We only take the two corresponding features
X = iris.data[:, pair]
y = iris.target

# Train
clf.fit(X, y)

# Plot the decision boundary
ax = plt.subplot(2, 3, pairidx + 1)
plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
DecisionBoundaryDisplay.from_estimator(
clf,
X,
cmap=plt.cm.RdYlBu,
edgecolor="black",
s=15,
response_method="predict",
ax=ax,
xlabel=iris.feature_names[pair[0]],
ylabel=iris.feature_names[pair[1]],
)

plt.suptitle("Decision surface of decision trees trained on pairs of features")
plt.legend(loc="lower right", borderpad=0, handletextpad=0)
_ = plt.axis("tight")
# Plot the training points
for i, color in zip(range(n_classes), plot_colors):
idx = np.where(y == i)
plt.scatter(
X[idx, 0],
X[idx, 1],
c=color,
label=iris.target_names[i],
cmap=plt.cm.RdYlBu,
edgecolor="black",
s=15,
)

plt.suptitle(f"Decision surface of {clf_lab} decision trees trained on pairs of features")
plt.legend(loc="lower right", borderpad=0, handletextpad=0)
_ = plt.axis("tight")
plt.show()

# %%
# Display the structure of a single decision tree trained on all the features
# together.
from sklearn.tree import plot_tree

plt.figure()
clf = DecisionTreeClassifier().fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.title("Decision tree trained on all the iris features")
plt.show()
for clf, clf_lab in zip(clfs, clf_labels):
plt.figure()
clf.fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.title(f"{clf_lab} decision tree trained on all the iris features")
plt.show()

# %%