Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: add return_inverse to pd.unique #24119

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
19 changes: 19 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,25 @@ of the Series or columns of a DataFrame will also have string dtype.
We recommend explicitly using the ``string`` data type when working with strings.
See :ref:`text.types` for more.


.. _whatsnew_1000.enhancements.unique:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note, this was chopped off of #24108 and the section is intended to be bigger, compare here


Changes to the ``unique``-method
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The function :func:`pandas.unique` now supports the keyword ``return_inverse``. If ``True``,
the output is a tuple ``(uniques, inverse)``, where ``inverse`` is an ndarray that maps each
position in the input to the location of its value in the returned unique values.

.. ipython:: python

idx = pd.Index([1, 0, 0, 1])
uniques, inverse = pd.unique(idx, return_inverse=True)
uniques
inverse
reconstruct = pd.Index(uniques[inverse])
reconstruct.equals(idx)

.. _whatsnew_1000.enhancements.other:

Other enhancements
Expand Down
41 changes: 38 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def match(to_match, values, na_sentinel=-1):
return result


def unique(values):
def unique(values, return_inverse=False):
"""
Hash table-based unique. Uniques are returned in order
of appearance. This does NOT sort.
Expand All @@ -316,6 +316,13 @@ def unique(values):
Parameters
----------
values : 1d array-like
return_inverse : boolean, default False
Whether to return the inverse of the unique values. If True, the
output will be a tuple ``(uniques, inverse)``. The second component is
an np.ndarray that maps each position in `values` to the location of
its value in the unique values.

.. versionadded:: 1.0.0

Returns
-------
Expand Down Expand Up @@ -384,19 +391,47 @@ def unique(values):
>>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')])
array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
"""
from pandas import Index

values = _ensure_arraylike(values)

if is_extension_array_dtype(values):
# Dispatch to extension dtype's unique.
if return_inverse:
# as long as return_inverse is not part of the EA.unique contract,
# test if this works
try:
# make sure that we're not calling from an Index/Series
# container, as these do not support return_inverse yet
ea_val = getattr(values, "array", values)
result, inverse = ea_val.unique(return_inverse=return_inverse)

if is_categorical_dtype(values) and isinstance(values, Index):
# pd.unique(CategoricalIndex) returns Index not Categorical
result = Index(result)
return result, inverse
except TypeError:
msg = (
"The Extension Array class for type {dtype} does not "
"yet support the unique-method with "
'"return_inverse=True".'.format(dtype=type(values))
)
raise NotImplementedError(msg)
return values.unique()

original = values
htable, _, values, dtype, ndtype = _get_hashtable_algo(values)

table = htable(len(values))
uniques = table.unique(values)
uniques = _reconstruct_data(uniques, original.dtype, original)
if return_inverse:
uniques, inverse = table.unique(values, return_inverse=True)
else:
uniques = table.unique(values)

uniques = _reconstruct_data(uniques, dtype, original)

if return_inverse:
return uniques, inverse
return uniques


Expand Down
27 changes: 23 additions & 4 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2274,7 +2274,7 @@ def mode(self, dropna=True):
codes = sorted(htable.mode_int64(ensure_int64(codes), dropna))
return self._constructor(values=codes, dtype=self.dtype, fastpath=True)

def unique(self):
def unique(self, return_inverse=False):
"""
Return the ``Categorical`` whose ``categories`` and ``codes`` are
unique. Unused categories are NOT returned.
Expand All @@ -2284,9 +2284,21 @@ def unique(self):
- ordered category: values are sorted by appearance order, categories
keeps existing order.

Parameters
----------
return_inverse : boolean, default False
Whether to return the inverse of the unique values. If True, the
output will be a tuple ``(uniques, inverse)``: a ``Categorical`` and an
np.ndarray. The second component maps each position in the calling
Categorical to the location of its value in the unique values.

.. versionadded:: 1.0.0

Returns
-------
unique values : ``Categorical``
uniques : ``Categorical``
inverse : np.ndarray (if `return_inverse=True`)
The inverse from the `uniques` back to the calling ``Categorical``.

Examples
--------
Expand Down Expand Up @@ -2318,7 +2330,10 @@ def unique(self):
"""

# unlike np.unique, unique1d does not sort
unique_codes = unique1d(self.codes)
if return_inverse:
unique_codes, inverse = unique1d(self.codes, return_inverse=True)
else:
unique_codes = unique1d(self.codes, return_inverse=False)
cat = self.copy()

# keep nan in codes
Expand All @@ -2328,7 +2343,11 @@ def unique(self):
take_codes = unique_codes[unique_codes != -1]
if self.ordered:
take_codes = np.sort(take_codes)
return cat.set_categories(cat.categories.take(take_codes))
result = cat.set_categories(cat.categories.take(take_codes))

if return_inverse:
return result, inverse
return result

def _values_for_factorize(self):
codes = self.codes.astype("int64")
Expand Down