Skip to content

Commit

Permalink
API: add return_inverse to pd.unique
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari committed Dec 5, 2018
1 parent 4ae63aa commit d19f073
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 98 deletions.
18 changes: 18 additions & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,24 @@ Example:
See the :ref:`advanced docs on renaming<advanced.index_names>` for more details.


.. _whatsnew_0240.enhancements.unique:

Changes to the ``unique``-method
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The function :func:`pandas.unique` now supports the keyword ``return_inverse``. If set to ``True``,
the output is a tuple whose second component is an ndarray that maps each position in the
original values to the location of that value in the returned unique values.

.. ipython:: python

   idx = pd.Index([1, 0, 0, 1])
   uniques, inverse = pd.unique(idx, return_inverse=True)
   uniques
   inverse
   reconstruct = pd.Index(uniques[inverse])
   reconstruct.equals(idx)

.. _whatsnew_0240.enhancements.other:

Other Enhancements
Expand Down
29 changes: 27 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def match(to_match, values, na_sentinel=-1):
return result


def unique(values):
def unique(values, return_inverse=False):
"""
Hash table-based unique. Uniques are returned in order
of appearance. This does NOT sort.
Expand Down Expand Up @@ -344,18 +344,41 @@ def unique(values):
pandas.Index.unique
pandas.Series.unique
"""
from pandas import Index

values = _ensure_arraylike(values)

if is_extension_array_dtype(values):
# Dispatch to extension dtype's unique.
if return_inverse:
# as long as return_inverse is not part of the EA.unique contract,
# test if this works
try:
# make sure that we're not calling from an Index/Series
# container, as these do not support return_inverse yet
ea_val = getattr(values, 'array', values)
result, inverse = ea_val.unique(return_inverse=return_inverse)

if is_categorical_dtype(values) and isinstance(values, Index):
# pd.unique(CategoricalIndex) returns Index not Categorical
result = Index(result)
return result, inverse
except TypeError:
msg = ('The Extension Array class for type {dtype} does not '
'yet support the unique-method with '
'"return_inverse=True".'.format(dtype=type(values)))
raise NotImplementedError(msg)
return values.unique()

original = values
htable, _, values, dtype, ndtype = _get_hashtable_algo(values)

table = htable(len(values))
uniques = table.unique(values)
if return_inverse:
uniques, inverse = table.unique(values, return_inverse=True)
else:
uniques = table.unique(values)

uniques = _reconstruct_data(uniques, dtype, original)

if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
Expand All @@ -365,6 +388,8 @@ def unique(values):
# TODO: it must return DatetimeArray with tz in pandas 2.0
uniques = uniques.astype(object).values

if return_inverse:
return uniques, inverse
return uniques


Expand Down
28 changes: 24 additions & 4 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2249,7 +2249,7 @@ def mode(self, dropna=True):
codes = sorted(htable.mode_int64(ensure_int64(codes), dropna))
return self._constructor(values=codes, dtype=self.dtype, fastpath=True)

def unique(self):
def unique(self, return_inverse=False):
"""
Return the ``Categorical`` which ``categories`` and ``codes`` are
unique. Unused categories are NOT returned.
Expand All @@ -2259,9 +2259,22 @@ def unique(self):
- ordered category: values are sorted by appearance order, categories
keeps existing order.
Parameters
----------
return_inverse : boolean, default False
Whether to return the inverse of the unique values. If True, the
output will be a tuple where the second component is again an
np.ndarray that contains the mapping between the indices of the
elements in the calling Categorical and their locations in the
unique values. See examples for how to reconstruct.
.. versionadded:: 0.24.0
Returns
-------
unique values : ``Categorical``
uniques : ``Categorical``
inverse : np.ndarray (if `return_inverse=True`)
The inverse from the `uniques` back to the calling ``Categorical``.
Examples
--------
Expand Down Expand Up @@ -2293,7 +2306,10 @@ def unique(self):
"""

# unlike np.unique, unique1d does not sort
unique_codes = unique1d(self.codes)
if return_inverse:
unique_codes, inverse = unique1d(self.codes, return_inverse=True)
else:
unique_codes = unique1d(self.codes, return_inverse=False)
cat = self.copy()

# keep nan in codes
Expand All @@ -2303,7 +2319,11 @@ def unique(self):
take_codes = unique_codes[unique_codes != -1]
if self.ordered:
take_codes = np.sort(take_codes)
return cat.set_categories(cat.categories.take(take_codes))
result = cat.set_categories(cat.categories.take(take_codes))

if return_inverse:
return result, inverse
return result

def _values_for_factorize(self):
codes = self.codes.astype('int64')
Expand Down
189 changes: 97 additions & 92 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,20 @@
from pandas.util.testing import assert_almost_equal


def assert_series_or_index_or_array_or_categorical_equal(left, right):
    """
    Assert that ``left`` equals ``right`` using the comparison helper that
    matches the type of ``left``.

    Parameters
    ----------
    left : Series, Index, np.ndarray or Categorical
        Object under test. Its type selects which ``tm.assert_*_equal``
        helper is called.
    right : object
        Expected value; passed unchanged to the selected helper.

    Raises
    ------
    AssertionError
        If the selected helper reports a mismatch, or if ``left`` is not
        an instance of one of the supported types.
    """
    if isinstance(left, Series):
        tm.assert_series_equal(left, right)
    elif isinstance(left, Index):
        tm.assert_index_equal(left, right)
    elif isinstance(left, np.ndarray):
        tm.assert_numpy_array_equal(left, right)
    elif isinstance(left, Categorical):
        tm.assert_categorical_equal(left, right)
    else:
        # Raise explicitly instead of relying on a bare ``assert``: assert
        # statements are removed under ``python -O``, which would let an
        # unsupported type pass silently. The exception type is unchanged.
        raise AssertionError(
            'Expected Series, Index, np.ndarray or Categorical, '
            'got {typ}'.format(typ=type(left).__name__))


class TestMatch(object):

def test_ints(self):
Expand Down Expand Up @@ -321,17 +335,22 @@ def test_parametrized_factorize_na_value(self, data, na_value):

class TestUnique(object):

def test_ints(self):
arr = np.random.randint(0, 100, size=50)
def test_unique_inverse(self, any_numpy_dtype):
dtype = any_numpy_dtype
arr = np.random.randint(0, 100, size=50).astype(dtype)

result = algos.unique(arr)
assert isinstance(result, np.ndarray)

def test_objects(self):
arr = np.random.randint(0, 100, size=50).astype('O')
# reuse result as expected outcome of return_inverse case
expected_uniques = result.copy()

result = algos.unique(arr)
assert isinstance(result, np.ndarray)
result_uniques, result_inverse = algos.unique(arr, return_inverse=True)
tm.assert_numpy_array_equal(result_uniques, expected_uniques)

# reconstruction can only work if inverse is correct
reconstr = result_uniques[result_inverse]
tm.assert_numpy_array_equal(reconstr, arr, check_dtype=False)

def test_object_refcount_bug(self):
lst = ['A', 'B', 'C', 'D', 'E']
Expand Down Expand Up @@ -376,24 +395,26 @@ def test_datetime64_dtype_array_returned(self):
tm.assert_numpy_array_equal(result, expected)
assert result.dtype == expected.dtype

def test_timedelta64_dtype_array_returned(self):
@pytest.mark.parametrize('box', [Index, Series, np.array])
def test_timedelta64_dtype_array_returned(self, box):
# GH 9431
expected = np.array([31200, 45678, 10000], dtype='m8[ns]')

td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
result = algos.unique(td_index)
tm.assert_numpy_array_equal(result, expected)
assert result.dtype == expected.dtype
obj = box(td_index)

s = Series(td_index)
result = algos.unique(s)
result = algos.unique(obj)
tm.assert_numpy_array_equal(result, expected)
assert result.dtype == expected.dtype

arr = s.values
result = algos.unique(arr)
tm.assert_numpy_array_equal(result, expected)
assert result.dtype == expected.dtype
# reuse result as expected outcome of return_inverse case
expected_uniques = result.copy()

result_uniques, result_inverse = algos.unique(obj, return_inverse=True)
tm.assert_numpy_array_equal(result_uniques, expected_uniques)

# reconstruction can only work if inverse is correct
reconstr = box(result_uniques[result_inverse])
assert_series_or_index_or_array_or_categorical_equal(reconstr, obj)

def test_uint64_overflow(self):
s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
Expand All @@ -406,78 +427,80 @@ def test_nan_in_object_array(self):
expected = np.array(['a', np.nan, 'c'], dtype=object)
tm.assert_numpy_array_equal(result, expected)

def test_categorical(self):
result_uniques, result_inverse = pd.unique(duplicated_items,
return_inverse=True)
expected_inverse = np.array([0, 1, 2, 2], dtype='int64')
tm.assert_numpy_array_equal(result_inverse, expected_inverse)

@pytest.mark.parametrize('ordered', [True, False])
@pytest.mark.parametrize('box', [lambda x: x, Series, Index],
ids=['Categorical', 'Series', 'Index'])
@pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs),
pd.unique],
ids=['classmethod', 'toplevel'])
def test_categorical(self, method, box, ordered):

# we are expecting to return in the order
# of appearance
expected = Categorical(list('bac'), categories=list('bac'))
categories = list('abc') if ordered else list('bac')
expected = Categorical(list('bac'), categories=categories,
ordered=ordered)

# we are expecting to return in the order
# of the categories
expected_o = Categorical(
list('bac'), categories=list('abc'), ordered=True)
# Index.unique always returns Index
# pd.unique(Index) stays Index (only) for Categorical
expected = box(expected) if box == Index else expected

# GH 15939
c = Categorical(list('baabc'))
result = c.unique()
tm.assert_categorical_equal(result, expected)
c = box(Categorical(list('baabc'), categories=categories,
ordered=ordered))
result = method(c)

result = algos.unique(c)
tm.assert_categorical_equal(result, expected)
assert_series_or_index_or_array_or_categorical_equal(result, expected)

c = Categorical(list('baabc'), ordered=True)
result = c.unique()
tm.assert_categorical_equal(result, expected_o)
if method == pd.unique:
# [Series/Index].unique do not yet support return_inverse=True

result = algos.unique(c)
tm.assert_categorical_equal(result, expected_o)
# reuse result as expected outcome of return_inverse case
expected_uniques = result.copy()
result_uniques, result_inverse = method(c, return_inverse=True)

# Series of categorical dtype
s = Series(Categorical(list('baabc')), name='foo')
result = s.unique()
tm.assert_categorical_equal(result, expected)
assert_series_or_index_or_array_or_categorical_equal(
result_uniques, expected_uniques)

result = pd.unique(s)
tm.assert_categorical_equal(result, expected)
# reconstruction can only work if inverse is correct
reconstr = box(result_uniques[result_inverse])
assert_series_or_index_or_array_or_categorical_equal(reconstr, c)

# CI -> return CI
ci = CategoricalIndex(Categorical(list('baabc'),
categories=list('bac')))
expected = CategoricalIndex(expected)
result = ci.unique()
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize('box', [Series, Index])
@pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs),
pd.unique],
ids=['classmethod', 'toplevel'])
def test_datetime64tz_aware(self, method, box):
# GH 15939

result = pd.unique(ci)
tm.assert_index_equal(result, expected)
ts = Timestamp('20160101', tz='US/Eastern')
obj = box([ts, ts])

def test_datetime64tz_aware(self):
# GH 15939
if box == Series:
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
tz='US/Eastern')], dtype=object)
else: # Index
expected = Index([ts])

result = Series(
Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')])).unique()
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
tz='US/Eastern')], dtype=object)
tm.assert_numpy_array_equal(result, expected)
result = method(obj)
assert_series_or_index_or_array_or_categorical_equal(result, expected)

result = Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')]).unique()
expected = DatetimeIndex(['2016-01-01 00:00:00'],
dtype='datetime64[ns, US/Eastern]', freq=None)
tm.assert_index_equal(result, expected)

result = pd.unique(
Series(Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')])))
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
tz='US/Eastern')], dtype=object)
tm.assert_numpy_array_equal(result, expected)
if method == pd.unique:
# [Series/Index].unique do not yet support return_inverse=True

# reuse result as expected outcome of return_inverse case
expected_uniques = result.copy()
result_uniques, result_inverse = method(obj, return_inverse=True)

result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')]))
expected = DatetimeIndex(['2016-01-01 00:00:00'],
dtype='datetime64[ns, US/Eastern]', freq=None)
tm.assert_index_equal(result, expected)
assert_series_or_index_or_array_or_categorical_equal(
result_uniques, expected_uniques)

# reconstruction can only work if inverse is correct
reconstr = box(result_uniques[result_inverse])
assert_series_or_index_or_array_or_categorical_equal(reconstr, obj)

def test_order_of_appearance(self):
# 9346
Expand All @@ -491,28 +514,10 @@ def test_order_of_appearance(self):
tm.assert_numpy_array_equal(result,
np.array([2, 1], dtype='int64'))

result = pd.unique(Series([Timestamp('20160101'),
Timestamp('20160101')]))
expected = np.array(['2016-01-01T00:00:00.000000000'],
dtype='datetime64[ns]')
tm.assert_numpy_array_equal(result, expected)

result = pd.unique(Index(
[Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')]))
expected = DatetimeIndex(['2016-01-01 00:00:00'],
dtype='datetime64[ns, US/Eastern]',
freq=None)
tm.assert_index_equal(result, expected)

result = pd.unique(list('aabc'))
expected = np.array(['a', 'b', 'c'], dtype=object)
tm.assert_numpy_array_equal(result, expected)

result = pd.unique(Series(Categorical(list('aabc'))))
expected = Categorical(list('abc'))
tm.assert_categorical_equal(result, expected)

@pytest.mark.parametrize("arg ,expected", [
(('1', '1', '2'), np.array(['1', '2'], dtype=object)),
(('foo',), np.array(['foo'], dtype=object))
Expand Down

0 comments on commit d19f073

Please sign in to comment.