Skip to content

Commit

Permalink
BUG: _nsorted incorrect with duplicated values in index (pandas-dev#1…
Browse files Browse the repository at this point in the history
…3412)

Add note to whatsnew

Add nlargest benchmark

Add tests for Series

organize nsorted methods

pep 8 fixes

passed test and pep8

add docstrings
  • Loading branch information
mroeschke committed Nov 25, 2016
1 parent 22d982a commit 0e6ea02
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 16 deletions.
11 changes: 11 additions & 0 deletions asv_bench/benchmarks/frame_methods.py
Expand Up @@ -1012,3 +1012,14 @@ def setup(self):

def time_frame_quantile_axis1(self):
self.df.quantile([0.1, 0.5], axis=1)


class frame_nlargest(object):
goal_time = 0.2

def setup(self):
self.df = DataFrame(np.random.randn(1000, 3),
columns=list('ABC'))

def time_frame_nlargest(self):
self.df.nlargest(100, 'A')
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.19.2.txt
Expand Up @@ -52,7 +52,7 @@ Bug Fixes
- Bug in resampling a ``DatetimeIndex`` in local TZ, covering a DST change, which would raise ``AmbiguousTimeError`` (:issue:`14682`)



- Bug in ``DataFrame.nlargest`` and ``DataFrame.nsmallest`` when the index had duplicate values (:issue:`13412`)



Expand Down
30 changes: 28 additions & 2 deletions pandas/core/algorithms.py
Expand Up @@ -684,11 +684,12 @@ def select_n_slow(dropped, n, keep, method):
_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}


def select_n(series, n, keep, method):
"""Implement n largest/smallest.
def select_n_series(series, n, keep, method):
"""Implement n largest/smallest for pandas Series
Parameters
----------
series : pandas.Series object
n : int
keep : {'first', 'last'}, default 'first'
method : str, {'nlargest', 'nsmallest'}
Expand Down Expand Up @@ -717,6 +718,31 @@ def select_n(series, n, keep, method):
return dropped.iloc[inds]


def select_n_frame(frame, columns, n, method, keep):
"""Implement n largest/smallest for pandas DataFrame
Parameters
----------
series : pandas.DataFrame object
columns : list or str
n : int
keep : {'first', 'last'}, default 'first'
method : str, {'nlargest', 'nsmallest'}
Returns
-------
nordered : DataFrame
"""
from pandas.core.series import Series
if not is_list_like(columns):
columns = [columns]
columns = list(columns)
ser = getattr(frame[columns[0]], method)(n, keep=keep)
if isinstance(ser, Series):
ser = ser.to_frame()
return ser.merge(frame, on=columns[0], left_index=True)[frame.columns]


def _finalize_nsmallest(arr, kth_val, n, keep, narr):
ns, = np.nonzero(arr <= kth_val)
inds = ns[arr[ns].argsort(kind='mergesort')][:n]
Expand Down
13 changes: 2 additions & 11 deletions pandas/core/frame.py
Expand Up @@ -3390,15 +3390,6 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False,
return self.sort_index(level=level, axis=axis, ascending=ascending,
inplace=inplace, sort_remaining=sort_remaining)

def _nsorted(self, columns, n, method, keep):
if not is_list_like(columns):
columns = [columns]
columns = list(columns)
ser = getattr(self[columns[0]], method)(n, keep=keep)
ascending = dict(nlargest=False, nsmallest=True)[method]
return self.loc[ser.index].sort_values(columns, ascending=ascending,
kind='mergesort')

def nlargest(self, n, columns, keep='first'):
"""Get the rows of a DataFrame sorted by the `n` largest
values of `columns`.
Expand Down Expand Up @@ -3431,7 +3422,7 @@ def nlargest(self, n, columns, keep='first'):
1 10 b 2
2 8 d NaN
"""
return self._nsorted(columns, n, 'nlargest', keep)
return algos.select_n_frame(self, columns, n, 'nlargest', keep)

def nsmallest(self, n, columns, keep='first'):
"""Get the rows of a DataFrame sorted by the `n` smallest
Expand Down Expand Up @@ -3465,7 +3456,7 @@ def nsmallest(self, n, columns, keep='first'):
0 1 a 1
2 8 d NaN
"""
return self._nsorted(columns, n, 'nsmallest', keep)
return algos.select_n_frame(self, columns, n, 'nsmallest', keep)

def swaplevel(self, i=-2, j=-1, axis=0):
"""
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/series.py
Expand Up @@ -1935,7 +1935,7 @@ def nlargest(self, n=5, keep='first'):
>>> s = pd.Series(np.random.randn(1e6))
>>> s.nlargest(10) # only sorts up to the N requested
"""
return algos.select_n(self, n=n, keep=keep, method='nlargest')
return algos.select_n_series(self, n=n, keep=keep, method='nlargest')

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
False: 'first'})
Expand Down Expand Up @@ -1973,7 +1973,7 @@ def nsmallest(self, n=5, keep='first'):
>>> s = pd.Series(np.random.randn(1e6))
>>> s.nsmallest(10) # only sorts up to the N requested
"""
return algos.select_n(self, n=n, keep=keep, method='nsmallest')
return algos.select_n_series(self, n=n, keep=keep, method='nsmallest')

def sortlevel(self, level=0, ascending=True, sort_remaining=True):
"""
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/frame/test_analytics.py
Expand Up @@ -1323,6 +1323,34 @@ def test_nsmallest_multiple_columns(self):
expected = df.sort_values(['a', 'c']).head(5)
tm.assert_frame_equal(result, expected)

def test_nsmallest_nlargest_duplicate_index(self):
df = pd.DataFrame({'a': [1, 2, 3, 4],
'b': [4, 3, 2, 1],
'c': [0, 1, 2, 3]},
index=[0, 0, 1, 1])
result = df.nsmallest(4, 'a')
expected = df.sort_values('a').head(4)
tm.assert_frame_equal(result, expected)

result = df.nlargest(4, 'a')
expected = df.sort_values('a', ascending=False).head(4)
tm.assert_frame_equal(result, expected)

result = df.nsmallest(4, ['a', 'c'])
expected = df.sort_values(['a', 'c']).head(4)
tm.assert_frame_equal(result, expected)

result = df.nsmallest(4, ['c', 'a'])
expected = df.sort_values(['c', 'a']).head(4)
tm.assert_frame_equal(result, expected)

result = df.nlargest(4, ['a', 'c'])
expected = df.sort_values(['a', 'c'], ascending=False).head(4)
tm.assert_frame_equal(result, expected)

result = df.nlargest(4, ['c', 'a'])
expected = df.sort_values(['c', 'a'], ascending=False).head(4)
tm.assert_frame_equal(result, expected)
# ----------------------------------------------------------------------
# Isin

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/series/test_analytics.py
Expand Up @@ -1515,6 +1515,15 @@ def test_nsmallest_nlargest(self):
with tm.assertRaisesRegexp(ValueError, msg):
s.nlargest(keep='invalid')

# GH 13412
s = Series([1, 4, 3, 2], index=[0, 0, 1, 1])
result = s.nlargest(3)
expected = s.sort_values(ascending=False).head(3)
assert_series_equal(result, expected)
result = s.nsmallest(3)
expected = s.sort_values().head(3)
assert_series_equal(result, expected)

def test_sortlevel(self):
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
s = Series([1, 2], mi)
Expand Down

0 comments on commit 0e6ea02

Please sign in to comment.