Skip to content

Commit

Permalink
Implement DataFrame.last and Series.last functionality (#2121)
Browse files Browse the repository at this point in the history
This change implements `DataFrame.last` and `Series.last`, mirroring the functionality available in pandas. The requirement was raised in issue databricks/koalas#1929.

```python
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
>>> ks_series = ks.Series([1, 2, 3, 4], index=index)
>>> ks_series
2018-04-09  1
2018-04-11  2
2018-04-13  3
2018-04-15  4
dtype: int64

>>> ks_series.last('3D')
2018-04-13  3
2018-04-15  4
dtype: int64
```

```python
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
>>> pdf = pd.DataFrame({'A': [1, 2, 3, 4]}, index=index)
>>> kdf = ks.from_pandas(pdf)
>>> kdf
            A
2018-04-09  1
2018-04-11  2
2018-04-13  3
2018-04-15  4

>>> kdf.last('3D')
            A
2018-04-13  3
2018-04-15  4
```
  • Loading branch information
rising-star92 committed Mar 30, 2021
1 parent 8410914 commit e7bc2b6
Show file tree
Hide file tree
Showing 8 changed files with 126 additions and 2 deletions.
58 changes: 58 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import pandas as pd
from pandas.api.types import is_list_like, is_dict_like, is_scalar
from pandas.api.extensions import ExtensionDtype
from pandas.tseries.frequencies import DateOffset, to_offset

if TYPE_CHECKING:
from pandas.io.formats.style import Styler
Expand Down Expand Up @@ -5670,6 +5671,63 @@ def head(self, n: int = 5) -> "DataFrame":
sdf = sdf.orderBy(NATURAL_ORDER_COLUMN_NAME)
return DataFrame(self._internal.with_new_sdf(sdf.limit(n)))

def last(self, offset: Union[str, DateOffset]) -> "DataFrame":
    """
    Select final periods of time series data based on a date offset.

    When having a DataFrame with dates as index, this function can
    select the last few rows based on a date offset.

    Parameters
    ----------
    offset : str or DateOffset
        The offset length of the data that will be selected. For instance,
        '3D' will display all the rows having their index within the last 3 days.

    Returns
    -------
    DataFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a :class:`DatetimeIndex`

    Examples
    --------
    >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
    >>> kdf = ks.DataFrame({'A': [1, 2, 3, 4]}, index=index)
    >>> kdf
                A
    2018-04-09  1
    2018-04-11  2
    2018-04-13  3
    2018-04-15  4

    Get the rows for the last 3 days:

    >>> kdf.last('3D')
                A
    2018-04-13  3
    2018-04-15  4

    Notice the data for 3 last calendar days were returned, not the last
    3 observed days in the dataset, and therefore data for 2018-04-11 was
    not returned.
    """
    # Imported at call time rather than at module level
    # (presumably to avoid a circular import -- TODO confirm).
    from databricks.koalas.indexes import DatetimeIndex

    if isinstance(self.index, DatetimeIndex):
        # The window starts at the latest index value minus the normalized
        # offset; everything from that label onward is selected via ``.loc``.
        # NOTE(review): whether a row sitting exactly on ``window_start`` is
        # kept depends on ``.loc`` label-slice semantics -- confirm this
        # matches the pandas behavior for ``last``.
        window_start = self.index.max() - to_offset(offset)
        return cast(DataFrame, self.loc[window_start:])

    # Only a datetime-like index gives the offset arithmetic a meaning.
    raise TypeError("'last' only supports a DatetimeIndex")

def pivot_table(
self, values=None, index=None, columns=None, aggfunc="mean", fill_value=None
) -> "DataFrame":
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ class _MissingPandasLikeDataFrame(object):
first = _unsupported_function("first")
infer_objects = _unsupported_function("infer_objects")
interpolate = _unsupported_function("interpolate")
last = _unsupported_function("last")
lookup = _unsupported_function("lookup")
mode = _unsupported_function("mode")
reorder_levels = _unsupported_function("reorder_levels")
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ class MissingPandasLikeSeries(object):
first = _unsupported_function("first")
infer_objects = _unsupported_function("infer_objects")
interpolate = _unsupported_function("interpolate")
last = _unsupported_function("last")
reorder_levels = _unsupported_function("reorder_levels")
resample = _unsupported_function("resample")
searchsorted = _unsupported_function("searchsorted")
Expand Down
48 changes: 48 additions & 0 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from pandas.io.formats.printing import pprint_thing
from pandas.api.types import is_list_like, is_hashable
from pandas.api.extensions import ExtensionDtype
from pandas.tseries.frequencies import DateOffset
import pyspark
from pyspark import sql as spark
from pyspark.sql import functions as F, Column
Expand Down Expand Up @@ -2218,6 +2219,53 @@ def head(self, n: int = 5) -> "Series":
"""
return first_series(self.to_frame().head(n)).rename(self.name)

def last(self, offset: Union[str, DateOffset]) -> "Series":
    """
    Select final periods of time series data based on a date offset.

    When having a Series with dates as index, this function can
    select the last few elements based on a date offset.

    Parameters
    ----------
    offset : str or DateOffset
        The offset length of the data that will be selected. For instance,
        '3D' will display all the rows having their index within the last 3 days.

    Returns
    -------
    Series
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a :class:`DatetimeIndex`

    Examples
    --------
    >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
    >>> ks_series = ks.Series([1, 2, 3, 4], index=index)
    >>> ks_series
    2018-04-09    1
    2018-04-11    2
    2018-04-13    3
    2018-04-15    4
    dtype: int64

    Get the rows for the last 3 days:

    >>> ks_series.last('3D')
    2018-04-13    3
    2018-04-15    4
    dtype: int64

    Notice the data for 3 last calendar days were returned, not the last
    3 observed days in the dataset, and therefore data for 2018-04-11 was
    not returned.
    """
    # Delegate the selection to the frame-level ``last`` on a one-column
    # frame, then pull the column back out and restore this Series' name.
    trimmed_frame = self.to_frame().last(offset)
    return first_series(trimmed_frame).rename(self.name)

# TODO: Categorical type isn't supported (due to PySpark's limitation) and
# some doctests related with timestamps were not added.
def unique(self) -> "Series":
Expand Down
10 changes: 10 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
import pyspark
from pyspark import StorageLevel
from pyspark.ml.linalg import SparseVector
Expand Down Expand Up @@ -5202,6 +5203,15 @@ def test_last_valid_index(self):
kdf = ks.Series([]).to_frame()
self.assert_eq(pdf.last_valid_index(), kdf.last_valid_index())

def test_last(self):
    """Check DataFrame.last against pandas for string and DateOffset inputs."""
    # Four timestamps, two days apart, starting on 2018-04-09.
    dates = pd.date_range("2018-04-09", periods=4, freq="2D")
    pandas_frame = pd.DataFrame([1, 2, 3, 4], index=dates)
    koalas_frame = ks.from_pandas(pandas_frame)

    # The same one-day window, spelled as a frequency string and as an offset object.
    for window in ("1D", DateOffset(days=1)):
        self.assert_eq(pandas_frame.last(window), koalas_frame.last(window))

    # A frame built without an explicit datetime index must be rejected.
    with self.assertRaisesRegex(TypeError, "'last' only supports a DatetimeIndex"):
        ks.DataFrame([1, 2, 3, 4]).last("1D")

def test_first_valid_index(self):
pdf = pd.DataFrame(
{"a": [None, 2, 3, 2], "b": [None, 2.0, 3.0, 1.0], "c": [None, 200, 400, 200]},
Expand Down
8 changes: 8 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,14 @@ def test_head(self):
self.assert_eq(kser.head(-3), pser.head(-3))
self.assert_eq(kser.head(-10), pser.head(-10))

def test_last(self):
    """Check Series.last against pandas and its rejection of non-datetime indexes."""
    index = pd.date_range("2018-04-09", periods=4, freq="2D")
    pd_input = pd.Series([1, 2, 3, 4], index=index)
    ks_input = ks.Series([1, 2, 3, 4], index=index)

    # ``self.kser`` is a fixture defined elsewhere on the test class
    # (presumably without a datetime index -- verify against the fixture).
    # Pin the error message so an unrelated TypeError cannot satisfy the check.
    with self.assertRaisesRegex(TypeError, "'last' only supports a DatetimeIndex"):
        self.kser.last("1D")

    self.assert_eq(ks_input.last("1D"), pd_input.last("1D"))
    # The offset may also be given as a DateOffset object rather than a string.
    self.assert_eq(
        ks_input.last(pd.DateOffset(days=1)), pd_input.last(pd.DateOffset(days=1))
    )

def test_rename(self):
pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x")
kser = ks.from_pandas(pser)
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ Reindexing / Selection / Label manipulation
DataFrame.equals
DataFrame.filter
DataFrame.head
DataFrame.last
DataFrame.rename
DataFrame.rename_axis
DataFrame.reset_index
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ Reindexing / Selection / Label manipulation
Series.idxmax
Series.idxmin
Series.isin
Series.last
Series.rename
Series.rename_axis
Series.reindex
Expand Down

0 comments on commit e7bc2b6

Please sign in to comment.