pandas-dev · h-vetinari · Feb 1, 2019 · Feb 1, 2019 · Feb 1, 2019 · Feb 1, 2019
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -94,6 +94,25 @@ of the Series or columns of a DataFrame will also have string dtype.
 We recommend explicitly using the ``string`` data type when working with strings.
 See :ref:`text.types` for more.
 
+
+.. _whatsnew_1000.enhancements.unique:
+
+Changes to the ``unique``-method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The function :func:`pandas.unique` now supports the keyword ``return_inverse``. If ``True``,
+the output is a tuple whose second component is an ndarray that maps each position in the
+input values to its location in the returned unique values.
+
+.. ipython:: python
+
+    idx = pd.Index([1, 0, 0, 1])
+    uniques, inverse = pd.unique(idx, return_inverse=True)
+    uniques
+    inverse
+    reconstruct = pd.Index(uniques[inverse])
+    reconstruct.equals(idx)
+
 .. _whatsnew_1000.enhancements.other:
 
 Other enhancements

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -306,7 +306,7 @@ def match(to_match, values, na_sentinel=-1):
     return result
 
 
-def unique(values):
+def unique(values, return_inverse=False):
     """
     Hash table-based unique. Uniques are returned in order
     of appearance. This does NOT sort.
@@ -316,6 +316,13 @@ def unique(values):
     Parameters
     ----------
     values : 1d array-like
+    return_inverse : bool, default False
+        Whether to also return the inverse of the unique values. If True, the
+        output will be a tuple ``(uniques, inverse)``. The second component
+        maps the position of each element in `values` to its location in
+        the unique values.
+
+        .. versionadded:: 1.0.0
 
     Returns
     -------
@@ -384,19 +391,47 @@ def unique(values):
     >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')])
     array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
     """
+    from pandas import Index
 
     values = _ensure_arraylike(values)
 
     if is_extension_array_dtype(values):
         # Dispatch to extension dtype's unique.
+        if return_inverse:
+            # as long as return_inverse is not part of the EA.unique contract,
+            # test if this works
+            try:
+                # make sure that we're not calling from an Index/Series
+                # container, as these do not support return_inverse yet
+                ea_val = getattr(values, "array", values)
+                result, inverse = ea_val.unique(return_inverse=return_inverse)
+
+                if is_categorical_dtype(values) and isinstance(values, Index):
+                    # pd.unique(CategoricalIndex) returns Index not Categorical
+                    result = Index(result)
+                return result, inverse
+            except TypeError:
+                msg = (
+                    "The Extension Array class for type {dtype} does not "
+                    "yet support the unique-method with "
+                    '"return_inverse=True".'.format(dtype=type(values))
+                )
+                raise NotImplementedError(msg)
         return values.unique()
 
     original = values
     htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
 
     table = htable(len(values))
-    uniques = table.unique(values)
-    uniques = _reconstruct_data(uniques, original.dtype, original)
+    if return_inverse:
+        uniques, inverse = table.unique(values, return_inverse=True)
+    else:
+        uniques = table.unique(values)
+
+    uniques = _reconstruct_data(uniques, dtype, original)
+
+    if return_inverse:
+        return uniques, inverse
     return uniques
 
 

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2274,7 +2274,7 @@ def mode(self, dropna=True):
         codes = sorted(htable.mode_int64(ensure_int64(codes), dropna))
         return self._constructor(values=codes, dtype=self.dtype, fastpath=True)
 
-    def unique(self):
+    def unique(self, return_inverse=False):
         """
         Return the ``Categorical`` which ``categories`` and ``codes`` are
         unique. Unused categories are NOT returned.
@@ -2284,9 +2284,21 @@ def unique(self):
         - ordered category: values are sorted by appearance order, categories
           keeps existing order.
 
+        Parameters
+        ----------
+        return_inverse : bool, default False
+            Whether to also return the inverse of the unique values. If True,
+            the output will be a tuple of the unique ``Categorical`` and an
+            np.ndarray. The ndarray maps the position of each element in the
+            calling Categorical to its location in the unique values.
+
+            .. versionadded:: 1.0.0
+
         Returns
         -------
-        unique values : ``Categorical``
+        uniques : ``Categorical``
+        inverse : np.ndarray (if `return_inverse=True`)
+            The inverse from the `uniques` back to the calling ``Categorical``.
 
         Examples
         --------
@@ -2318,7 +2330,10 @@ def unique(self):
         """
 
         # unlike np.unique, unique1d does not sort
-        unique_codes = unique1d(self.codes)
+        if return_inverse:
+            unique_codes, inverse = unique1d(self.codes, return_inverse=True)
+        else:
+            unique_codes = unique1d(self.codes, return_inverse=False)
         cat = self.copy()
 
         # keep nan in codes
@@ -2328,7 +2343,11 @@ def unique(self):
         take_codes = unique_codes[unique_codes != -1]
         if self.ordered:
             take_codes = np.sort(take_codes)
-        return cat.set_categories(cat.categories.take(take_codes))
+        result = cat.set_categories(cat.categories.take(take_codes))
+
+        if return_inverse:
+            return result, inverse
+        return result
 
     def _values_for_factorize(self):
         codes = self.codes.astype("int64")