diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index a21dee2e612d26..df73a474b26837 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1012,3 +1012,14 @@ def setup(self): def time_frame_quantile_axis1(self): self.df.quantile([0.1, 0.5], axis=1) + + +class frame_nlargest(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(1000, 3), + columns=list('ABC')) + + def time_frame_nlargest(self): + self.df.nlargest(100, 'A') diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 5a255d1e62043b..dbc50097fb746c 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -52,7 +52,7 @@ Bug Fixes - Bug in resampling a ``DatetimeIndex`` in local TZ, covering a DST change, which would raise ``AmbiguousTimeError`` (:issue:`14682`) - +- Bug in ``DataFrame.nlargest`` and ``DataFrame.nsmallest`` when the index had duplicate values (:issue:`13412`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8644d4568e44d9..deb5ab60ac13ea 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -684,11 +684,12 @@ def select_n_slow(dropped, n, keep, method): _select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest} -def select_n(series, n, keep, method): - """Implement n largest/smallest. 
+def select_n_series(series, n, keep, method): + """Implement n largest/smallest for pandas Series Parameters ---------- + series : pandas.Series object n : int keep : {'first', 'last'}, default 'first' method : str, {'nlargest', 'nsmallest'} @@ -717,6 +718,31 @@ def select_n(series, n, keep, method): return dropped.iloc[inds] +def select_n_frame(frame, columns, n, method, keep): + """Implement n largest/smallest for pandas DataFrame + + Parameters + ---------- + frame : pandas.DataFrame object + columns : list or str + n : int + keep : {'first', 'last'}, default 'first' + method : str, {'nlargest', 'nsmallest'} + + Returns + ------- + nordered : DataFrame + """ + from pandas.core.series import Series + if not is_list_like(columns): + columns = [columns] + columns = list(columns) + ser = getattr(frame[columns[0]], method)(n, keep=keep) + if isinstance(ser, Series): + ser = ser.to_frame() + return ser.merge(frame, on=columns[0], left_index=True)[frame.columns] + + def _finalize_nsmallest(arr, kth_val, n, keep, narr): ns, = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind='mergesort')][:n] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bf1ff28cd63b10..a6a821212e6501 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3390,15 +3390,6 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, return self.sort_index(level=level, axis=axis, ascending=ascending, inplace=inplace, sort_remaining=sort_remaining) - def _nsorted(self, columns, n, method, keep): - if not is_list_like(columns): - columns = [columns] - columns = list(columns) - ser = getattr(self[columns[0]], method)(n, keep=keep) - ascending = dict(nlargest=False, nsmallest=True)[method] - return self.loc[ser.index].sort_values(columns, ascending=ascending, - kind='mergesort') - def nlargest(self, n, columns, keep='first'): """Get the rows of a DataFrame sorted by the `n` largest values of `columns`. 
@@ -3431,7 +3422,7 @@ def nlargest(self, n, columns, keep='first'): 1 10 b 2 2 8 d NaN """ - return self._nsorted(columns, n, 'nlargest', keep) + return algos.select_n_frame(self, columns, n, 'nlargest', keep) def nsmallest(self, n, columns, keep='first'): """Get the rows of a DataFrame sorted by the `n` smallest @@ -3465,7 +3456,7 @@ def nsmallest(self, n, columns, keep='first'): 0 1 a 1 2 8 d NaN """ - return self._nsorted(columns, n, 'nsmallest', keep) + return algos.select_n_frame(self, columns, n, 'nsmallest', keep) def swaplevel(self, i=-2, j=-1, axis=0): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 105e39562f5611..94d2069e2a5b3a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1935,7 +1935,7 @@ def nlargest(self, n=5, keep='first'): >>> s = pd.Series(np.random.randn(1e6)) >>> s.nlargest(10) # only sorts up to the N requested """ - return algos.select_n(self, n=n, keep=keep, method='nlargest') + return algos.select_n_series(self, n=n, keep=keep, method='nlargest') @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @@ -1973,7 +1973,7 @@ def nsmallest(self, n=5, keep='first'): >>> s = pd.Series(np.random.randn(1e6)) >>> s.nsmallest(10) # only sorts up to the N requested """ - return algos.select_n(self, n=n, keep=keep, method='nsmallest') + return algos.select_n_series(self, n=n, keep=keep, method='nsmallest') def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index e73d3c58aea854..8021ac52a2ad91 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1323,6 +1323,34 @@ def test_nsmallest_multiple_columns(self): expected = df.sort_values(['a', 'c']).head(5) tm.assert_frame_equal(result, expected) + def test_nsmallest_nlargest_duplicate_index(self): + df = pd.DataFrame({'a': [1, 2, 3, 4], + 'b': [4, 3, 2, 1], + 'c': [0, 1, 2, 3]}, + index=[0, 0, 
1, 1]) + result = df.nsmallest(4, 'a') + expected = df.sort_values('a').head(4) + tm.assert_frame_equal(result, expected) + + result = df.nlargest(4, 'a') + expected = df.sort_values('a', ascending=False).head(4) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(4, ['a', 'c']) + expected = df.sort_values(['a', 'c']).head(4) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(4, ['c', 'a']) + expected = df.sort_values(['c', 'a']).head(4) + tm.assert_frame_equal(result, expected) + + result = df.nlargest(4, ['a', 'c']) + expected = df.sort_values(['a', 'c'], ascending=False).head(4) + tm.assert_frame_equal(result, expected) + + result = df.nlargest(4, ['c', 'a']) + expected = df.sort_values(['c', 'a'], ascending=False).head(4) + tm.assert_frame_equal(result, expected) # ---------------------------------------------------------------------- # Isin diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6de1a68464436b..26c2220c4811a1 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1515,6 +1515,15 @@ def test_nsmallest_nlargest(self): with tm.assertRaisesRegexp(ValueError, msg): s.nlargest(keep='invalid') + # GH 13412 + s = Series([1, 4, 3, 2], index=[0, 0, 1, 1]) + result = s.nlargest(3) + expected = s.sort_values(ascending=False).head(3) + assert_series_equal(result, expected) + result = s.nsmallest(3) + expected = s.sort_values().head(3) + assert_series_equal(result, expected) + def test_sortlevel(self): mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) s = Series([1, 2], mi)