[SPARK-36435][PYTHON] Implement MultiIndex.equal_levels
### What changes were proposed in this pull request?
This PR proposes implementing `MultiIndex.equal_levels`.

```python
>>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
>>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
>>> psmidx1.equal_levels(psmidx2)
True

>>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")])
>>> psmidx2 = ps.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")])
>>> psmidx1.equal_levels(psmidx2)
True
```
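
For reference, `equal_levels` compares the levels themselves (the distinct values used at each position) rather than the row-by-row labels, which is why the order of the tuples above does not matter. This mirrors the pandas behaviour; a small illustrative check in plain pandas, not part of the patch:

```python
>>> import pandas as pd
>>> pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
>>> pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
>>> pmidx1.equals(pmidx2)  # element-wise comparison, so the order matters
False
>>> pmidx1.equal_levels(pmidx2)  # only the sets of level values are compared
True
```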

This was originally proposed in databricks/koalas#1789, and all review comments on the original PR have been resolved.

### Why are the changes needed?

We should support as much of the pandas API as possible in the pandas-on-Spark module.

### Does this PR introduce _any_ user-facing change?

Yes, the `MultiIndex.equal_levels` API is available.

### How was this patch tested?

Unit tests (a new `test_multiindex_equal_levels` case in `python/pyspark/pandas/tests/indexes/test_base.py`).

Closes #34113 from itholic/SPARK-36435.

Lead-authored-by: itholic <haejoon.lee@databricks.com>
Co-authored-by: Haejoon Lee <44108233+itholic@users.noreply.github.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
2 people authored and HyukjinKwon committed Oct 1, 2021
1 parent 4aeddb8 commit 13ddc91
Showing 4 changed files with 72 additions and 2 deletions.
1 change: 1 addition & 0 deletions python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -269,6 +269,7 @@ MultiIndex Modifying and computations
:toctree: api/

MultiIndex.equals
MultiIndex.equal_levels
MultiIndex.identical
MultiIndex.insert
MultiIndex.drop
37 changes: 36 additions & 1 deletion python/pyspark/pandas/indexes/multi.py
@@ -16,7 +16,7 @@
#

from distutils.version import LooseVersion
from functools import partial
from functools import partial, reduce
from typing import Any, Callable, Iterator, List, Optional, Tuple, Union, cast, no_type_check

import pandas as pd
@@ -1137,6 +1137,41 @@ def intersection(self, other: Union[DataFrame, Series, Index, List]) -> "MultiIn
        )
        return cast(MultiIndex, DataFrame(internal).index)

    def equal_levels(self, other: "MultiIndex") -> bool:
        """
        Return True if the levels of both MultiIndex objects are the same

        .. versionadded:: 3.3.0

        Examples
        --------
        >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
        >>> psmidx1.equal_levels(psmidx2)
        True

        >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")])
        >>> psmidx1.equal_levels(psmidx2)
        False
        """
        nlevels = self.nlevels
        if nlevels != other.nlevels:
            return False

        self_sdf = self._internal.spark_frame
        other_sdf = other._internal.spark_frame
        subtract_list = []
        for nlevel in range(nlevels):
            self_index_scol = self._internal.index_spark_columns[nlevel]
            other_index_scol = other._internal.index_spark_columns[nlevel]
            self_subtract_other = self_sdf.select(self_index_scol).subtract(
                other_sdf.select(other_index_scol)
            )
            subtract_list.append(self_subtract_other)

        unioned_subtracts = reduce(lambda x, y: x.union(y), subtract_list)
        return len(unioned_subtracts.head(1)) == 0

    @property
    def hasnans(self) -> bool:
        raise NotImplementedError("hasnans is not defined for MultiIndex")
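In the new `equal_levels` method above, each level is compared with a set difference on the Spark side: for every level position, the values of `self` that do not appear in `other` are computed with `subtract`, the per-level differences are unioned, and the levels are treated as equal when that union is empty, so at most one row (`head(1)`) is ever brought back to the driver. Below is a rough standalone sketch of the same idea using plain Spark DataFrames; the `SparkSession` setup and the `l0`/`l1` column names are illustrative assumptions, not part of the patch:

```python
# Minimal, self-contained sketch of the level comparison, written against
# plain Spark DataFrames. The session setup and the l0/l1 column names are
# illustrative assumptions, not part of the patch itself.
from functools import reduce

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Two "indexes", one column per level; same level values, different row order.
left = spark.createDataFrame([("a", "x"), ("b", "y"), ("c", "z")], ["l0", "l1"])
right = spark.createDataFrame([("b", "y"), ("a", "x"), ("c", "z")], ["l0", "l1"])

# Per-level set difference: values of `left` that do not appear in `right`.
diffs = [left.select(name).subtract(right.select(name)) for name in ("l0", "l1")]

# The levels match when the union of all per-level differences has no rows;
# head(1) pulls at most one row back to the driver.
unioned = reduce(lambda a, b: a.union(b), diffs)
print(len(unioned.head(1)) == 0)  # True for this data
```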
1 change: 0 additions & 1 deletion python/pyspark/pandas/missing/indexes.py
@@ -105,7 +105,6 @@ class MissingPandasLikeMultiIndex(object):
    # Functions
    argsort = _unsupported_function("argsort")
    asof_locs = _unsupported_function("asof_locs")
    equal_levels = _unsupported_function("equal_levels")
    factorize = _unsupported_function("factorize")
    format = _unsupported_function("format")
    get_indexer = _unsupported_function("get_indexer")
35 changes: 35 additions & 0 deletions python/pyspark/pandas/tests/indexes/test_base.py
@@ -2388,6 +2388,41 @@ def test_map(self):
            lambda: psidx.map({1: 1, 2: 2.0, 3: "three"}),
        )

    def test_multiindex_equal_levels(self):
        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
        psmidx1 = ps.from_pandas(pmidx1)
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")])
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")])
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z")])
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")])
        pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")])
        psmidx1 = ps.from_pandas(pmidx1)
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))

        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        pmidx2 = pd.MultiIndex.from_tuples([("a", "x", "q"), ("b", "y", "w"), ("c", "z", "e")])
        psmidx1 = ps.from_pandas(pmidx1)
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
    def test_to_numpy(self):
        pidx = pd.Index([1, 2, 3, 4])
        psidx = ps.from_pandas(pidx)
