[SPARK-36435][PYTHON] Implement MultiIndex.equal_levels
### What changes were proposed in this pull request?
This PR proposes implementing `MultiIndex.equal_levels`.

```python
>>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
>>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
>>> psmidx1.equal_levels(psmidx2)
True

>>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")])
>>> psmidx2 = ps.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")])
>>> psmidx1.equal_levels(psmidx2)
True
```
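
For reference, `equal_levels` compares the levels themselves (the distinct values used at each position) rather than the row-by-row labels, which is why the order of the tuples above does not matter. This mirrors the pandas behaviour; a small illustrative check in plain pandas, not part of the patch:

```python
>>> import pandas as pd
>>> pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
>>> pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
>>> pmidx1.equals(pmidx2)  # element-wise comparison, so the order matters
False
>>> pmidx1.equal_levels(pmidx2)  # only the sets of level values are compared
True
```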

This was originally proposed in databricks/koalas#1789, and all review comments on the original PR have been resolved.

### Why are the changes needed?

We should support as much of the pandas API as possible in the pandas-on-Spark module.

### Does this PR introduce _any_ user-facing change?

Yes, the `MultiIndex.equal_levels` API is available.

### How was this patch tested?

Unit tests (a new `test_multiindex_equal_levels` case in `python/pyspark/pandas/tests/indexes/test_base.py`).

Closes #34113 from itholic/SPARK-36435.

Lead-authored-by: itholic <haejoon.lee@databricks.com>
Co-authored-by: Haejoon Lee <44108233+itholic@users.noreply.github.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
2 people authored and HyukjinKwon committed Oct 1, 2021
1 parent 4aeddb8 commit 13ddc91
Showing 4 changed files with 72 additions and 2 deletions.
1 change: 1 addition & 0 deletions python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -269,6 +269,7 @@ MultiIndex Modifying and computations
:toctree: api/

MultiIndex.equals
MultiIndex.equal_levels
MultiIndex.identical
MultiIndex.insert
MultiIndex.drop
37 changes: 36 additions & 1 deletion python/pyspark/pandas/indexes/multi.py
@@ -16,7 +16,7 @@
#

from distutils.version import LooseVersion
from functools import partial
from functools import partial, reduce
from typing import Any, Callable, Iterator, List, Optional, Tuple, Union, cast, no_type_check

import pandas as pd
@@ -1137,6 +1137,41 @@ def intersection(self, other: Union[DataFrame, Series, Index, List]) -> "MultiIn
        )
        return cast(MultiIndex, DataFrame(internal).index)

    def equal_levels(self, other: "MultiIndex") -> bool:
        """
        Return True if the levels of both MultiIndex objects are the same

        .. versionadded:: 3.3.0

        Examples
        --------
        >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
        >>> psmidx1.equal_levels(psmidx2)
        True

        >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")])
        >>> psmidx1.equal_levels(psmidx2)
        False
        """
        nlevels = self.nlevels
        if nlevels != other.nlevels:
            return False

        self_sdf = self._internal.spark_frame
        other_sdf = other._internal.spark_frame
        subtract_list = []
        for nlevel in range(nlevels):
            self_index_scol = self._internal.index_spark_columns[nlevel]
            other_index_scol = other._internal.index_spark_columns[nlevel]
            self_subtract_other = self_sdf.select(self_index_scol).subtract(
                other_sdf.select(other_index_scol)
            )
            subtract_list.append(self_subtract_other)

        unioned_subtracts = reduce(lambda x, y: x.union(y), subtract_list)
        return len(unioned_subtracts.head(1)) == 0

    @property
    def hasnans(self) -> bool:
        raise NotImplementedError("hasnans is not defined for MultiIndex")
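In the new `equal_levels` method above, each level is compared with a set difference on the Spark side: for every level position, the values of `self` that do not appear in `other` are computed with `subtract`, the per-level differences are unioned, and the levels are treated as equal when that union is empty, so at most one row (`head(1)`) is ever brought back to the driver. Below is a rough standalone sketch of the same idea using plain Spark DataFrames; the `SparkSession` setup and the `l0`/`l1` column names are illustrative assumptions, not part of the patch:

```python
# Minimal, self-contained sketch of the level comparison, written against
# plain Spark DataFrames. The session setup and the l0/l1 column names are
# illustrative assumptions, not part of the patch itself.
from functools import reduce

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Two "indexes", one column per level; same level values, different row order.
left = spark.createDataFrame([("a", "x"), ("b", "y"), ("c", "z")], ["l0", "l1"])
right = spark.createDataFrame([("b", "y"), ("a", "x"), ("c", "z")], ["l0", "l1"])

# Per-level set difference: values of `left` that do not appear in `right`.
diffs = [left.select(name).subtract(right.select(name)) for name in ("l0", "l1")]

# The levels match when the union of all per-level differences has no rows;
# head(1) pulls at most one row back to the driver.
unioned = reduce(lambda a, b: a.union(b), diffs)
print(len(unioned.head(1)) == 0)  # True for this data
```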
1 change: 0 additions & 1 deletion python/pyspark/pandas/missing/indexes.py
@@ -105,7 +105,6 @@ class MissingPandasLikeMultiIndex(object):
    # Functions
    argsort = _unsupported_function("argsort")
    asof_locs = _unsupported_function("asof_locs")
    equal_levels = _unsupported_function("equal_levels")
    factorize = _unsupported_function("factorize")
    format = _unsupported_function("format")
    get_indexer = _unsupported_function("get_indexer")
35 changes: 35 additions & 0 deletions python/pyspark/pandas/tests/indexes/test_base.py
@@ -2388,6 +2388,41 @@ def test_map(self):
            lambda: psidx.map({1: 1, 2: 2.0, 3: "three"}),
        )

    def test_multiindex_equal_levels(self):
        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
        psmidx1 = ps.from_pandas(pmidx1)
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")])
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")])
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z")])
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")])
        pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")])
        psmidx1 = ps.from_pandas(pmidx1)
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        pmidx2 = pd.MultiIndex.from_tuples([("a", "x", "q"), ("b", "y", "w"), ("c", "z", "e")])
        psmidx1 = ps.from_pandas(pmidx1)
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
    def test_to_numpy(self):
        pidx = pd.Index([1, 2, 3, 4])
        psidx = ps.from_pandas(pidx)
